@vespermcp/mcp-server 1.2.29 → 1.2.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,116 @@
1
+ # Vesper MCP Migration Guide (Pre-Production)
2
+
3
+ This guide documents the MCP surface consolidation for Vesper as a general-purpose data layer.
4
+
5
+ ## What Changed
6
+
7
+ - Removed: `configure_kaggle`
8
+ - Merged quality tools into `quality_analyze`
9
+ - Merged fusion tools into `fuse`
10
+ - Merged lineage tools into `lineage`
11
+ - Kept separate by design: `vesper_normalize_schema` and `vesper_convert_format`
12
+
13
+ ## Deprecation Map
14
+
15
+ - `configure_kaggle` -> `configure_keys`
16
+ - Argument mapping: `username` -> `kaggle_username`, `key` -> `kaggle_key`
17
+
18
+ - `analyze_quality` -> `quality_analyze` with `operation="dataset"`
19
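+ - `analyze_image_quality` -> `quality_analyze` with `operation="image"` (note: the result is now the raw JSON report rather than the Markdown summary the old tool returned)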
20
+ - `analyze_media_quality` -> `quality_analyze` with `operation="media"`
21
+ - `generate_quality_report` -> `quality_analyze` with `operation="report"`
22
+
23
+ - `fuse_datasets` -> `fuse` with `operation="tabular"`
24
+ - `vesper_fuse` (listed as `vesper.fuse` in the previous tool list) -> `fuse` with `operation="web"`
25
+
26
+ - `get_lineage` -> `lineage` with `operation="get"`
27
+ - `diff_lineage_versions` -> `lineage` with `operation="diff"`
28
+
29
+ ## Migration Examples
30
+
31
+ ### Credentials
32
+
33
+ Old:
34
+
35
+ ```json
36
+ { "name": "configure_kaggle", "arguments": { "username": "u", "key": "k" } }
37
+ ```
38
+
39
+ New:
40
+
41
+ ```json
42
+ {
43
+ "name": "configure_keys",
44
+ "arguments": {
45
+ "kaggle_username": "u",
46
+ "kaggle_key": "k"
47
+ }
48
+ }
49
+ ```
50
+
51
+ ### Quality
52
+
53
+ Old:
54
+
55
+ ```json
56
+ { "name": "analyze_quality", "arguments": { "dataset_id": "my_ds" } }
57
+ ```
58
+
59
+ New:
60
+
61
+ ```json
62
+ {
63
+ "name": "quality_analyze",
64
+ "arguments": {
65
+ "operation": "dataset",
66
+ "dataset_id": "my_ds"
67
+ }
68
+ }
69
+ ```
70
+
71
+ ### Fusion
72
+
73
+ Old:
74
+
75
+ ```json
76
+ { "name": "fuse_datasets", "arguments": { "sources": ["a", "b"], "strategy": "concat" } }
77
+ ```
78
+
79
+ New:
80
+
81
+ ```json
82
+ {
83
+ "name": "fuse",
84
+ "arguments": {
85
+ "operation": "tabular",
86
+ "sources": ["a", "b"],
87
+ "strategy": "concat"
88
+ }
89
+ }
90
+ ```
91
+
92
+ ### Lineage
93
+
94
+ Old:
95
+
96
+ ```json
97
+ { "name": "get_lineage", "arguments": { "dataset_id": "my_ds" } }
98
+ ```
99
+
100
+ New:
101
+
102
+ ```json
103
+ {
104
+ "name": "lineage",
105
+ "arguments": {
106
+ "operation": "get",
107
+ "dataset_id": "my_ds"
108
+ }
109
+ }
110
+ ```
111
+
112
+ ## Notes for Agent Builders
113
+
114
+ - Prefer the new unified tools for all new integrations.
115
+ - Keep `vesper_normalize_schema` and `vesper_convert_format` as two separate tools in your client; only wrap them in a single merged tool if your client can present explicit operation-specific schemas.
116
+ - If you have old prompts or tool maps, migrate them now, before production rollout.
package/build/index.js CHANGED
@@ -1493,55 +1493,6 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1493
1493
  required: ["query"],
1494
1494
  },
1495
1495
  },
1496
- {
1497
- name: "vesper.fuse",
1498
- description: "Phase 2 Data Fusion: fuse results from multiple web-native sources into one unified, deduplicated corpus (provenance via source_chain).",
1499
- inputSchema: {
1500
- type: "object",
1501
- properties: {
1502
- sources: {
1503
- type: "array",
1504
- description: "Web sources to collect from, each with its own query.",
1505
- items: {
1506
- type: "object",
1507
- properties: {
1508
- type: { type: "string", enum: ["arxiv", "github", "semantic_scholar", "hackernews", "s3"] },
1509
- query: { type: "string", description: "Query for this source." },
1510
- max_results: { type: "number", description: "Max results for this source (optional)." },
1511
- min_stars: { type: "number", description: "Optional popularity filter (GitHub) based on stars/proxy fields." },
1512
- bucket: { type: "string", description: "S3 bucket (for type='s3')." },
1513
- path: { type: "string", description: "S3 prefix/path (for type='s3')." },
1514
- region: { type: "string", description: "AWS region (for type='s3')." },
1515
- credentials: {
1516
- type: "object",
1517
- description: "Pass-through AWS credentials (optional; not persisted).",
1518
- properties: {
1519
- accessKeyId: { type: "string" },
1520
- secretAccessKey: { type: "string" },
1521
- sessionToken: { type: "string" },
1522
- roleArn: { type: "string" },
1523
- }
1524
- },
1525
- },
1526
- required: ["type", "query"],
1527
- },
1528
- },
1529
- merge_strategy: {
1530
- type: "string",
1531
- enum: ["union", "dedup"],
1532
- description: "How to merge collected documents.",
1533
- },
1534
- deduplication: {
1535
- type: "string",
1536
- enum: ["semantic", "exact", "none"],
1537
- description: "How to deduplicate across sources.",
1538
- },
1539
- agent_id: { type: "string", description: "Strongly recommended: caller agent identity for lineage/audit." },
1540
- pipeline_id: { type: "string", description: "Strongly recommended: workflow/pipeline identifier for lineage/audit." },
1541
- },
1542
- required: ["sources"],
1543
- },
1544
- },
1545
1496
  {
1546
1497
  name: "vesper.extract_web",
1547
1498
  description: "Phase 3 Structured Web extraction. Whitelist-only domains, deterministic extraction (tables/lists/infobox), schema validation, and cache fallback on live extraction failure.",
@@ -1636,18 +1587,6 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1636
1587
  required: ["dataset_id", "source"],
1637
1588
  },
1638
1589
  },
1639
- {
1640
- name: "configure_kaggle",
1641
- description: "Optionally store Kaggle API credentials for Kaggle discover/download. Core Vesper works without this.",
1642
- inputSchema: {
1643
- type: "object",
1644
- properties: {
1645
- username: { type: "string", description: "Kaggle username" },
1646
- key: { type: "string", description: "Kaggle API key" }
1647
- },
1648
- required: ["username", "key"],
1649
- },
1650
- },
1651
1590
  {
1652
1591
  name: "configure_keys",
1653
1592
  description: "One-time optional key setup for external sources (Kaggle, data.world, gated HF). Core tools do not require keys.",
@@ -1676,17 +1615,29 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1676
1615
  },
1677
1616
  },
1678
1617
  {
1679
- name: "analyze_quality",
1680
- description: "Perform a deep quality check on a dataset. Returns a detailed report including duplicates, outliers, and schema issues.",
1618
+ name: "quality_analyze",
1619
+ description: "Unified quality tool. operation='dataset' (tabular quality), 'image', 'media', or 'report' (multimodal report).",
1681
1620
  inputSchema: {
1682
1621
  type: "object",
1683
1622
  properties: {
1623
+ operation: {
1624
+ type: "string",
1625
+ enum: ["dataset", "image", "media", "report"],
1626
+ description: "Quality analysis mode. Defaults to 'dataset'.",
1627
+ },
1684
1628
  dataset_id: {
1685
1629
  type: "string",
1686
- description: "The dataset ID to analyze.",
1630
+ description: "Dataset ID for operation='dataset' or operation='report'.",
1631
+ },
1632
+ dataset_path: {
1633
+ type: "string",
1634
+ description: "Absolute dataset directory path for operation='report'.",
1635
+ },
1636
+ path: {
1637
+ type: "string",
1638
+ description: "Absolute file/folder path for operation='image' or operation='media'.",
1687
1639
  },
1688
1640
  },
1689
- required: ["dataset_id"],
1690
1641
  },
1691
1642
  },
1692
1643
  {
@@ -1838,39 +1789,30 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1838
1789
  },
1839
1790
  },
1840
1791
  {
1841
- name: "get_lineage",
1842
- description: "Get version history and full lineage/provenance for a dataset (sources, steps, inputs/outputs, trigger metadata).",
1792
+ name: "lineage",
1793
+ description: "Unified lineage tool. operation='get' returns lineage/provenance history, operation='diff' compares two versions.",
1843
1794
  inputSchema: {
1844
1795
  type: "object",
1845
1796
  properties: {
1846
- dataset_id: {
1797
+ operation: {
1847
1798
  type: "string",
1848
- description: "Dataset ID (base or versioned, e.g. my_dataset or my_dataset_v2).",
1799
+ enum: ["get", "diff"],
1800
+ description: "Lineage operation. Defaults to 'get'.",
1849
1801
  },
1850
- },
1851
- required: ["dataset_id"],
1852
- },
1853
- },
1854
- {
1855
- name: "diff_lineage_versions",
1856
- description: "Diff two lineage versions for one dataset and return structured changes (schema, rows, steps, actor identity).",
1857
- inputSchema: {
1858
- type: "object",
1859
- properties: {
1860
1802
  dataset_id: {
1861
1803
  type: "string",
1862
1804
  description: "Dataset ID (base or versioned).",
1863
1805
  },
1864
1806
  from_version: {
1865
1807
  type: "number",
1866
- description: "Source lineage version number (e.g., 1).",
1808
+ description: "Source lineage version number (required for operation='diff').",
1867
1809
  },
1868
1810
  to_version: {
1869
1811
  type: "number",
1870
- description: "Target lineage version number (e.g., 2).",
1812
+ description: "Target lineage version number (required for operation='diff').",
1871
1813
  },
1872
1814
  },
1873
- required: ["dataset_id", "from_version", "to_version"],
1815
+ required: ["dataset_id"],
1874
1816
  },
1875
1817
  },
1876
1818
  {
@@ -1945,109 +1887,55 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1945
1887
  },
1946
1888
  },
1947
1889
  {
1948
- name: "fuse_datasets",
1949
- description: "Fuse/combine multiple datasets via concat or join. Optionally runs quality & leakage checks afterward.",
1890
+ name: "fuse",
1891
+ description: "Unified fusion tool. operation='tabular' for row/column dataset fusion, operation='web' for web-native multi-source fusion.",
1950
1892
  inputSchema: {
1951
1893
  type: "object",
1952
1894
  properties: {
1953
- sources: {
1954
- type: "array",
1955
- items: { type: "string" },
1956
- description: "List of dataset IDs and/or local file paths to fuse.",
1957
- },
1958
- strategy: {
1959
- type: "string",
1960
- enum: ["concat", "join"],
1961
- description: "Fusion strategy. concat appends rows; join merges on key(s).",
1962
- },
1963
- join_on: {
1964
- oneOf: [
1965
- { type: "string" },
1966
- { type: "array", items: { type: "string" } }
1967
- ],
1968
- description: "Join key(s). Required when strategy='join'.",
1969
- },
1970
- how: {
1971
- type: "string",
1972
- enum: ["inner", "left", "outer"],
1973
- description: "Join mode (only for strategy='join').",
1974
- },
1975
- dedup: {
1976
- type: "boolean",
1977
- description: "Drop exact duplicate rows after fusion.",
1978
- },
1979
- run_quality_after: {
1980
- type: "boolean",
1981
- description: "Run quality analysis on the fused output.",
1982
- },
1983
- leakage_check: {
1984
- type: "boolean",
1985
- description: "Run leakage/overlap checks across fused sources.",
1986
- },
1987
- output_format: {
1988
- type: "string",
1989
- enum: ["feather", "parquet", "csv", "jsonl", "arrow"],
1990
- description: "Output format (default: parquet).",
1991
- },
1992
- compression: {
1895
+ operation: {
1993
1896
  type: "string",
1994
- enum: ["lz4", "zstd", "snappy", "gzip", "uncompressed"],
1995
- description: "Compression algorithm for binary outputs.",
1897
+ enum: ["tabular", "web"],
1898
+ description: "Fusion operation mode. Defaults to 'tabular'.",
1996
1899
  },
1997
- preview: {
1998
- type: "boolean",
1999
- description: "Generate a small preview CSV of fused output.",
1900
+ sources: {
1901
+ type: "array",
1902
+ description: "For tabular: dataset IDs/paths. For web: source query objects.",
1903
+ items: {
1904
+ oneOf: [
1905
+ { type: "string" },
1906
+ {
1907
+ type: "object",
1908
+ properties: {
1909
+ type: { type: "string", enum: ["arxiv", "github", "semantic_scholar", "hackernews", "s3"] },
1910
+ query: { type: "string" },
1911
+ max_results: { type: "number" },
1912
+ min_stars: { type: "number" },
1913
+ bucket: { type: "string" },
1914
+ path: { type: "string" },
1915
+ region: { type: "string" },
1916
+ },
1917
+ required: ["type", "query"],
1918
+ },
1919
+ ],
1920
+ },
2000
1921
  },
1922
+ strategy: { type: "string", enum: ["concat", "join"] },
1923
+ join_on: { oneOf: [{ type: "string" }, { type: "array", items: { type: "string" } }] },
1924
+ how: { type: "string", enum: ["inner", "left", "outer"] },
1925
+ dedup: { type: "boolean" },
1926
+ run_quality_after: { type: "boolean" },
1927
+ leakage_check: { type: "boolean" },
1928
+ output_format: { type: "string", enum: ["feather", "parquet", "csv", "jsonl", "arrow"] },
1929
+ compression: { type: "string", enum: ["lz4", "zstd", "snappy", "gzip", "uncompressed"] },
1930
+ preview: { type: "boolean" },
1931
+ merge_strategy: { type: "string", enum: ["union", "dedup"] },
1932
+ deduplication: { type: "string", enum: ["semantic", "exact", "none"] },
1933
+ agent_id: { type: "string", description: "Strongly recommended: caller agent identity for lineage/audit." },
1934
+ pipeline_id: { type: "string", description: "Strongly recommended: workflow/pipeline identifier for lineage/audit." },
2001
1935
  },
2002
1936
  required: ["sources"],
2003
1937
  },
2004
1938
  },
2005
- {
2006
- name: "analyze_image_quality",
2007
- description: "Analyze image quality (resolution, blur, corruption) for a folder or single image.",
2008
- inputSchema: {
2009
- type: "object",
2010
- properties: {
2011
- path: {
2012
- type: "string",
2013
- description: "Absolute path to the image file or folder.",
2014
- },
2015
- },
2016
- required: ["path"],
2017
- },
2018
- },
2019
- {
2020
- name: "analyze_media_quality",
2021
- description: "Analyze audio/video quality (sample rate, duration, FPS, corruption) for a folder or single file.",
2022
- inputSchema: {
2023
- type: "object",
2024
- properties: {
2025
- path: {
2026
- type: "string",
2027
- description: "Absolute path to the audio/video file or folder.",
2028
- },
2029
- },
2030
- required: ["path"],
2031
- },
2032
- },
2033
- {
2034
- name: "generate_quality_report",
2035
- description: "Generate a comprehensive unified quality report for a multimodal dataset (text, image, audio, video).",
2036
- inputSchema: {
2037
- type: "object",
2038
- properties: {
2039
- dataset_id: {
2040
- type: "string",
2041
- description: "Dataset identifier.",
2042
- },
2043
- dataset_path: {
2044
- type: "string",
2045
- description: "Absolute path to the dataset directory.",
2046
- },
2047
- },
2048
- required: ["dataset_id", "dataset_path"],
2049
- },
2050
- },
2051
1939
  ],
2052
1940
  };
2053
1941
  });
@@ -2114,6 +2002,101 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
2114
2002
  markStepComplete(String(datasetId), String(step));
2115
2003
  }
2116
2004
  switch (request.params.name) {
2005
+ case "lineage":
2006
+ case "get_lineage":
2007
+ case "diff_lineage_versions": {
2008
+ const operation = request.params.name === "get_lineage"
2009
+ ? "get"
2010
+ : request.params.name === "diff_lineage_versions"
2011
+ ? "diff"
2012
+ : String(request.params.arguments?.operation || "get").toLowerCase();
2013
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
2014
+ if (!datasetId) {
2015
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
2016
+ }
2017
+ if (operation === "get") {
2018
+ const base = toBaseDatasetId(datasetId);
2019
+ const record = readLineageRecord(base);
2020
+ if (!record.versions || record.versions.length === 0) {
2021
+ return {
2022
+ content: [{ type: "text", text: `No lineage found for '${datasetId}' yet.` }]
2023
+ };
2024
+ }
2025
+ return {
2026
+ content: [{ type: "text", text: JSON.stringify(record, null, 2) }]
2027
+ };
2028
+ }
2029
+ if (operation !== "diff") {
2030
+ throw new McpError(ErrorCode.InvalidParams, "operation must be 'get' or 'diff'");
2031
+ }
2032
+ const fromVersion = Number(request.params.arguments?.from_version);
2033
+ const toVersion = Number(request.params.arguments?.to_version);
2034
+ if (!Number.isInteger(fromVersion) || fromVersion <= 0) {
2035
+ throw new McpError(ErrorCode.InvalidParams, "from_version must be a positive integer");
2036
+ }
2037
+ if (!Number.isInteger(toVersion) || toVersion <= 0) {
2038
+ throw new McpError(ErrorCode.InvalidParams, "to_version must be a positive integer");
2039
+ }
2040
+ const base = toBaseDatasetId(datasetId);
2041
+ const record = readLineageRecord(base);
2042
+ const fromV = record.versions.find((v) => v.version === fromVersion);
2043
+ const toV = record.versions.find((v) => v.version === toVersion);
2044
+ if (!fromV || !toV) {
2045
+ return {
2046
+ content: [{ type: "text", text: `ERROR: Could not find both versions in lineage for '${datasetId}'.` }],
2047
+ isError: true,
2048
+ };
2049
+ }
2050
+ const fromSchema = (toV.output?.schema_before && toVersion > fromVersion)
2051
+ ? fromV.output?.schema_after || fromV.output?.schema_before || {}
2052
+ : fromV.output?.schema_after || fromV.output?.schema_before || {};
2053
+ const toSchema = toV.output?.schema_after || toV.output?.schema_before || {};
2054
+ const fromCols = Array.isArray(fromSchema.columns) ? fromSchema.columns.map((c) => String(c)) : [];
2055
+ const toCols = Array.isArray(toSchema.columns) ? toSchema.columns.map((c) => String(c)) : [];
2056
+ const fromDtypes = (fromSchema.dtypes && typeof fromSchema.dtypes === "object") ? fromSchema.dtypes : {};
2057
+ const toDtypes = (toSchema.dtypes && typeof toSchema.dtypes === "object") ? toSchema.dtypes : {};
2058
+ const schemaDiff = diffSchemaMaps(fromCols, toCols, fromDtypes, toDtypes);
2059
+ const fromRows = typeof fromSchema.rows === "number" ? fromSchema.rows : (typeof fromV.output?.rows === "number" ? fromV.output.rows : undefined);
2060
+ const toRows = typeof toSchema.rows === "number" ? toSchema.rows : (typeof toV.output?.rows === "number" ? toV.output.rows : undefined);
2061
+ const fromSteps = new Set((fromV.steps || []).map((s) => String(s.step)));
2062
+ const toSteps = new Set((toV.steps || []).map((s) => String(s.step)));
2063
+ return {
2064
+ content: [{
2065
+ type: "text",
2066
+ text: JSON.stringify({
2067
+ dataset_id_base: base,
2068
+ from_version: fromVersion,
2069
+ to_version: toVersion,
2070
+ schema_diff: schemaDiff,
2071
+ row_count_delta: {
2072
+ from: fromRows,
2073
+ to: toRows,
2074
+ delta: (typeof fromRows === "number" && typeof toRows === "number") ? (toRows - fromRows) : undefined,
2075
+ },
2076
+ steps_diff: {
2077
+ added: Array.from(toSteps).filter((s) => !fromSteps.has(s)),
2078
+ removed: Array.from(fromSteps).filter((s) => !toSteps.has(s)),
2079
+ from_steps: Array.from(fromSteps),
2080
+ to_steps: Array.from(toSteps),
2081
+ },
2082
+ actor_diff: {
2083
+ changed: String(fromV.triggered_by?.agent_id || "") !== String(toV.triggered_by?.agent_id || "") ||
2084
+ String(fromV.triggered_by?.pipeline_id || "") !== String(toV.triggered_by?.pipeline_id || ""),
2085
+ from: {
2086
+ tool: fromV.triggered_by?.tool,
2087
+ agent_id: fromV.triggered_by?.agent_id,
2088
+ pipeline_id: fromV.triggered_by?.pipeline_id,
2089
+ },
2090
+ to: {
2091
+ tool: toV.triggered_by?.tool,
2092
+ agent_id: toV.triggered_by?.agent_id,
2093
+ pipeline_id: toV.triggered_by?.pipeline_id,
2094
+ },
2095
+ },
2096
+ }, null, 2),
2097
+ }],
2098
+ };
2099
+ }
2117
2100
  case "vesper_web_find": {
2118
2101
  hydrateExternalKeys();
2119
2102
  const query = String(request.params.arguments?.query || "").trim();
@@ -2598,20 +2581,6 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
2598
2581
  };
2599
2582
  }
2600
2583
  }
2601
- case "configure_kaggle": {
2602
- const username = String(request.params.arguments?.username || "").trim();
2603
- const key = String(request.params.arguments?.key || "").trim();
2604
- if (!username || !key) {
2605
- throw new McpError(ErrorCode.InvalidParams, "username and key are required");
2606
- }
2607
- const r1 = secureKeys.set("kaggle_username", username);
2608
- const r2 = secureKeys.set("kaggle_key", key);
2609
- process.env.KAGGLE_USERNAME = username;
2610
- process.env.KAGGLE_KEY = key;
2611
- return {
2612
- content: [{ type: "text", text: `Kaggle credentials saved securely (${r1.method || "store"}/${r2.method || "store"}). You can now use Kaggle datasets.` }]
2613
- };
2614
- }
2615
2584
  case "configure_keys": {
2616
2585
  const hfToken = String(request.params.arguments?.hf_token || "").trim();
2617
2586
  const kaggleUsername = String(request.params.arguments?.kaggle_username || "").trim();
@@ -2717,8 +2686,56 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
2717
2686
  const formattedOutput = formatDatasetInfo(dataset);
2718
2687
  return { content: [{ type: "text", text: formattedOutput }] };
2719
2688
  }
2720
- case "analyze_quality": {
2721
- const datasetId = String(request.params.arguments?.dataset_id);
2689
+ case "quality_analyze":
2690
+ case "analyze_quality":
2691
+ case "analyze_image_quality":
2692
+ case "analyze_media_quality":
2693
+ case "generate_quality_report": {
2694
+ const resolvedOperation = request.params.name === "analyze_image_quality"
2695
+ ? "image"
2696
+ : request.params.name === "analyze_media_quality"
2697
+ ? "media"
2698
+ : request.params.name === "generate_quality_report"
2699
+ ? "report"
2700
+ : String(request.params.arguments?.operation || "dataset").toLowerCase();
2701
+ if (resolvedOperation === "image") {
2702
+ const inputPath = String(request.params.arguments?.path || "").trim();
2703
+ if (!inputPath || !fs.existsSync(inputPath)) {
2704
+ throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
2705
+ }
2706
+ const report = await imageAnalyzer.analyze(inputPath);
2707
+ return { content: [{ type: "text", text: JSON.stringify(report, null, 2) }] };
2708
+ }
2709
+ if (resolvedOperation === "media") {
2710
+ const inputPath = String(request.params.arguments?.path || "").trim();
2711
+ if (!inputPath || !fs.existsSync(inputPath)) {
2712
+ throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
2713
+ }
2714
+ const report = await mediaAnalyzer.analyze(inputPath);
2715
+ return { content: [{ type: "text", text: JSON.stringify(report, null, 2) }] };
2716
+ }
2717
+ if (resolvedOperation === "report") {
2718
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
2719
+ const datasetPath = String(request.params.arguments?.dataset_path || "").trim();
2720
+ if (!datasetId) {
2721
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='report'");
2722
+ }
2723
+ if (!datasetPath || !fs.existsSync(datasetPath)) {
2724
+ throw new McpError(ErrorCode.InvalidParams, `Dataset path not found: ${datasetPath}`);
2725
+ }
2726
+ const metadata = await metadataStore.getDataset(datasetId);
2727
+ const textQuality = null;
2728
+ const report = await qualityOrchestrator.generateReport(datasetId, datasetPath, textQuality);
2729
+ if (metadata) {
2730
+ metadata.unified_quality_report = report;
2731
+ await metadataStore.saveDataset(metadata);
2732
+ }
2733
+ return { content: [{ type: "text", text: JSON.stringify(report, null, 2) }] };
2734
+ }
2735
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
2736
+ if (!datasetId) {
2737
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='dataset'");
2738
+ }
2722
2739
  const safeId = toSafeDatasetPathFragment(datasetId);
2723
2740
  const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
2724
2741
  const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
@@ -3123,100 +3140,6 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
3123
3140
  content: [{ type: "text", text: `**Vesper Registry** (${entries.length} dataset${entries.length !== 1 ? "s" : ""}):\n\n${lines.join("\n\n")}` }]
3124
3141
  };
3125
3142
  }
3126
- case "get_lineage": {
3127
- const datasetId = String(request.params.arguments?.dataset_id || "").trim();
3128
- if (!datasetId) {
3129
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
3130
- }
3131
- const base = toBaseDatasetId(datasetId);
3132
- const record = readLineageRecord(base);
3133
- if (!record.versions || record.versions.length === 0) {
3134
- return {
3135
- content: [{ type: "text", text: `No lineage found for '${datasetId}' yet.` }]
3136
- };
3137
- }
3138
- return {
3139
- content: [{ type: "text", text: JSON.stringify(record, null, 2) }]
3140
- };
3141
- }
3142
- case "diff_lineage_versions": {
3143
- const datasetId = String(request.params.arguments?.dataset_id || "").trim();
3144
- const fromVersion = Number(request.params.arguments?.from_version);
3145
- const toVersion = Number(request.params.arguments?.to_version);
3146
- if (!datasetId) {
3147
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
3148
- }
3149
- if (!Number.isInteger(fromVersion) || fromVersion <= 0) {
3150
- throw new McpError(ErrorCode.InvalidParams, "from_version must be a positive integer");
3151
- }
3152
- if (!Number.isInteger(toVersion) || toVersion <= 0) {
3153
- throw new McpError(ErrorCode.InvalidParams, "to_version must be a positive integer");
3154
- }
3155
- const base = toBaseDatasetId(datasetId);
3156
- const record = readLineageRecord(base);
3157
- const fromV = record.versions.find((v) => v.version === fromVersion);
3158
- const toV = record.versions.find((v) => v.version === toVersion);
3159
- if (!fromV || !toV) {
3160
- return {
3161
- content: [{ type: "text", text: `ERROR: Could not find both versions in lineage for '${datasetId}'.` }],
3162
- isError: true,
3163
- };
3164
- }
3165
- const fromSchema = (toV.output?.schema_before && toVersion > fromVersion)
3166
- ? fromV.output?.schema_after || fromV.output?.schema_before || {}
3167
- : fromV.output?.schema_after || fromV.output?.schema_before || {};
3168
- const toSchema = toV.output?.schema_after || toV.output?.schema_before || {};
3169
- const fromCols = Array.isArray(fromSchema.columns) ? fromSchema.columns.map((c) => String(c)) : [];
3170
- const toCols = Array.isArray(toSchema.columns) ? toSchema.columns.map((c) => String(c)) : [];
3171
- const fromDtypes = (fromSchema.dtypes && typeof fromSchema.dtypes === "object") ? fromSchema.dtypes : {};
3172
- const toDtypes = (toSchema.dtypes && typeof toSchema.dtypes === "object") ? toSchema.dtypes : {};
3173
- const schemaDiff = diffSchemaMaps(fromCols, toCols, fromDtypes, toDtypes);
3174
- const fromRows = typeof fromSchema.rows === "number"
3175
- ? fromSchema.rows
3176
- : (typeof fromV.output?.rows === "number" ? fromV.output.rows : undefined);
3177
- const toRows = typeof toSchema.rows === "number"
3178
- ? toSchema.rows
3179
- : (typeof toV.output?.rows === "number" ? toV.output.rows : undefined);
3180
- const fromSteps = new Set((fromV.steps || []).map((s) => String(s.step)));
3181
- const toSteps = new Set((toV.steps || []).map((s) => String(s.step)));
3182
- const addedSteps = Array.from(toSteps).filter((s) => !fromSteps.has(s));
3183
- const removedSteps = Array.from(fromSteps).filter((s) => !toSteps.has(s));
3184
- const actorDiff = {
3185
- changed: String(fromV.triggered_by?.agent_id || "") !== String(toV.triggered_by?.agent_id || "") ||
3186
- String(fromV.triggered_by?.pipeline_id || "") !== String(toV.triggered_by?.pipeline_id || ""),
3187
- from: {
3188
- tool: fromV.triggered_by?.tool,
3189
- agent_id: fromV.triggered_by?.agent_id,
3190
- pipeline_id: fromV.triggered_by?.pipeline_id,
3191
- },
3192
- to: {
3193
- tool: toV.triggered_by?.tool,
3194
- agent_id: toV.triggered_by?.agent_id,
3195
- pipeline_id: toV.triggered_by?.pipeline_id,
3196
- },
3197
- };
3198
- const diffResult = {
3199
- dataset_id_base: base,
3200
- from_version: fromVersion,
3201
- to_version: toVersion,
3202
- schema_diff: schemaDiff,
3203
- row_count_delta: {
3204
- from: fromRows,
3205
- to: toRows,
3206
- delta: (typeof fromRows === "number" && typeof toRows === "number") ? (toRows - fromRows) : undefined,
3207
- },
3208
- steps_diff: {
3209
- added: addedSteps,
3210
- removed: removedSteps,
3211
- from_steps: Array.from(fromSteps),
3212
- to_steps: Array.from(toSteps),
3213
- },
3214
- actor_diff: actorDiff,
3215
- };
3216
- return {
3217
- content: [{ type: "text", text: JSON.stringify(diffResult, null, 2) }],
3218
- };
3219
- }
3220
3143
  case "vesper_convert_format": {
3221
3144
  const filePath = String(request.params.arguments?.file_path || "").trim();
3222
3145
  const targetFormat = String(request.params.arguments?.target_format || "").trim().toLowerCase();
@@ -3379,7 +3302,57 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
3379
3302
  return { content: [{ type: "text", text: `ERROR: Schema normalization failed: ${error.message}` }], isError: true };
3380
3303
  }
3381
3304
  }
3305
+ case "fuse":
3382
3306
  case "fuse_datasets": {
3307
+ const operation = request.params.name === "fuse_datasets"
3308
+ ? "tabular"
3309
+ : String(request.params.arguments?.operation || "tabular").toLowerCase();
3310
+ if (operation === "web") {
3311
+ hydrateExternalKeys();
3312
+ const webSources = Array.isArray(request.params.arguments?.sources)
3313
+ ? request.params.arguments?.sources
3314
+ : undefined;
3315
+ if (!webSources || !Array.isArray(webSources)) {
3316
+ return {
3317
+ content: [{ type: "text", text: "ERROR: fuse(operation='web') requires 'sources' array." }],
3318
+ isError: true,
3319
+ };
3320
+ }
3321
+ const mergeStrategyRaw = request.params.arguments?.merge_strategy
3322
+ ? String(request.params.arguments?.merge_strategy).toLowerCase()
3323
+ : undefined;
3324
+ const dedupRaw = request.params.arguments?.deduplication
3325
+ ? String(request.params.arguments?.deduplication).toLowerCase()
3326
+ : undefined;
3327
+ const merge_strategy = mergeStrategyRaw && ["union", "dedup"].includes(mergeStrategyRaw)
3328
+ ? mergeStrategyRaw
3329
+ : undefined;
3330
+ const deduplication = dedupRaw && ["semantic", "exact", "none"].includes(dedupRaw)
3331
+ ? dedupRaw
3332
+ : undefined;
3333
+ const webResult = await webFusionEngine.fuse({
3334
+ sources: webSources.map((s) => ({
3335
+ type: String(s?.type || "").trim().toLowerCase(),
3336
+ query: String(s?.query || "").trim(),
3337
+ max_results: s?.max_results !== undefined ? Number(s.max_results) : undefined,
3338
+ min_stars: s?.min_stars !== undefined ? Number(s.min_stars) : undefined,
3339
+ bucket: s?.bucket !== undefined ? String(s.bucket) : undefined,
3340
+ path: s?.path !== undefined ? String(s.path) : undefined,
3341
+ region: s?.region !== undefined ? String(s.region) : undefined,
3342
+ credentials: s?.credentials ? {
3343
+ accessKeyId: s.credentials.accessKeyId !== undefined ? String(s.credentials.accessKeyId) : undefined,
3344
+ secretAccessKey: s.credentials.secretAccessKey !== undefined ? String(s.credentials.secretAccessKey) : undefined,
3345
+ sessionToken: s.credentials.sessionToken !== undefined ? String(s.credentials.sessionToken) : undefined,
3346
+ roleArn: s.credentials.roleArn !== undefined ? String(s.credentials.roleArn) : undefined,
3347
+ } : undefined,
3348
+ })),
3349
+ merge_strategy,
3350
+ deduplication,
3351
+ });
3352
+ return {
3353
+ content: [{ type: "text", text: JSON.stringify(webResult, null, 2) }],
3354
+ };
3355
+ }
3383
3356
  const rawSources = request.params.arguments?.sources;
3384
3357
  if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
3385
3358
  throw new McpError(ErrorCode.InvalidParams, "sources must contain at least 2 dataset IDs/paths");
@@ -3493,142 +3466,6 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
3493
3466
  };
3494
3467
  }
3495
3468
  }
3496
- case "analyze_image_quality": {
3497
- const inputPath = String(request.params.arguments?.path);
3498
- if (!fs.existsSync(inputPath)) {
3499
- throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
3500
- }
3501
- try {
3502
- const report = await imageAnalyzer.analyze(inputPath);
3503
- let output = `## Image Quality Report\n\n`;
3504
- output += `- **Total Images**: ${report.total_images}\n`;
3505
- output += `- **Corrupted**: ${report.corrupted_count}\n`;
3506
- output += `- **Average Resolution**: ${Math.round(report.average_width)}x${Math.round(report.average_height)}\n`;
3507
- output += `- **Blurry Images**: ${report.blurry_count}\n\n`;
3508
- if (report.individual_results.length > 0) {
3509
- output += `### Sample Detail (Top 5)\n`;
3510
- report.individual_results.slice(0, 5).forEach(img => {
3511
- const statusLabel = img.status === "ok" ? (img.is_blurry ? "[warn]" : "[ok]") : "[error]";
3512
- output += `${statusLabel} **${img.filename}**: ${img.width}x${img.height} | Mode: ${img.mode} | Blur: ${img.blur_score?.toFixed(1)}\n`;
3513
- });
3514
- }
3515
- return {
3516
- content: [{ type: "text", text: output }]
3517
- };
3518
- }
3519
- catch (error) {
3520
- return {
3521
- content: [{ type: "text", text: `ERROR: Image analysis failed: ${error.message}` }],
3522
- isError: true
3523
- };
3524
- }
3525
- }
3526
- case "analyze_media_quality": {
3527
- const inputPath = String(request.params.arguments?.path);
3528
- if (!fs.existsSync(inputPath)) {
3529
- throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
3530
- }
3531
- try {
3532
- const report = await mediaAnalyzer.analyze(inputPath);
3533
- let output = `## Media Quality Report\n\n`;
3534
- output += `- **Total Files**: ${report.total_files}\n`;
3535
- output += `- **OK Files**: ${report.ok_files}\n`;
3536
- output += `- **Failed Files**: ${report.failed_files}\n`;
3537
- if ('avg_audio_duration' in report && report.avg_audio_duration) {
3538
- output += `- **Average Audio Duration**: ${report.avg_audio_duration.toFixed(2)}s\n`;
3539
- }
3540
- if ('avg_video_duration' in report && report.avg_video_duration) {
3541
- output += `- **Average Video Duration**: ${report.avg_video_duration.toFixed(2)}s\n`;
3542
- output += `- **Average FPS**: ${report.avg_fps?.toFixed(1)}\n`;
3543
- }
3544
- output += `\n### Sample Detail (Top 5)\n`;
3545
- report.details.slice(0, 5).forEach(item => {
3546
- const statusLabel = item.status === "ok" ? "[ok]" : "[error]";
3547
- if (item.type === "audio" && 'sample_rate' in item) {
3548
- output += `${statusLabel} **${item.filename}**: ${item.duration?.toFixed(1)}s | SR: ${item.sample_rate}Hz | Silent: ${item.is_silent}\n`;
3549
- }
3550
- else if (item.type === "video" && 'width' in item) {
3551
- output += `${statusLabel} **${item.filename}**: ${item.width}x${item.height} | ${item.fps?.toFixed(1)}fps | Risk: ${item.corruption_risk}\n`;
3552
- }
3553
- else {
3554
- output += `${statusLabel} **${item.filename}**: ${item.error}\n`;
3555
- }
3556
- });
3557
- return {
3558
- content: [{ type: "text", text: output }]
3559
- };
3560
- }
3561
- catch (error) {
3562
- return {
3563
- content: [{ type: "text", text: `ERROR: Media analysis failed: ${error.message}` }],
3564
- isError: true
3565
- };
3566
- }
3567
- }
3568
- case "generate_quality_report": {
3569
- const datasetId = String(request.params.arguments?.dataset_id);
3570
- const datasetPath = String(request.params.arguments?.dataset_path);
3571
- if (!fs.existsSync(datasetPath)) {
3572
- throw new McpError(ErrorCode.InvalidParams, `Dataset path not found: ${datasetPath}`);
3573
- }
3574
- try {
3575
- // Optionally load text quality from metadata if available
3576
- const metadata = await metadataStore.getDataset(datasetId);
3577
- // TODO: Integrate text quality analysis when available
3578
- const textQuality = null;
3579
- const report = await qualityOrchestrator.generateReport(datasetId, datasetPath, textQuality);
3580
- // Save report to metadata
3581
- if (metadata) {
3582
- metadata.unified_quality_report = report;
3583
- await metadataStore.saveDataset(metadata);
3584
- }
3585
- let output = `# Unified Quality Report\n\n`;
3586
- output += `**Dataset**: ${datasetId}\n`;
3587
- output += `**Modalities**: ${report.modalities.join(", ")}\n`;
3588
- output += `**Overall Quality Score**: ${report.overall_quality_score}/100\n\n`;
3589
- if (report.text_quality) {
3590
- output += `## Text Quality\n`;
3591
- output += `- Rows: ${report.text_quality.row_count}\n`;
3592
- output += `- Columns: ${report.text_quality.column_count}\n`;
3593
- output += `- Missing: ${report.text_quality.missing_percentage.toFixed(1)}%\n`;
3594
- output += `- Duplicates: ${report.text_quality.duplicate_percentage.toFixed(1)}%\n\n`;
3595
- }
3596
- if (report.image_quality) {
3597
- output += `## Image Quality\n`;
3598
- output += `- Total Images: ${report.image_quality.total_images}\n`;
3599
- output += `- Corrupted: ${report.image_quality.corrupted_count}\n`;
3600
- output += `- Avg Resolution: ${report.image_quality.avg_resolution}\n`;
3601
- output += `- Blurry: ${report.image_quality.blurry_percentage.toFixed(1)}%\n\n`;
3602
- }
3603
- if (report.audio_quality) {
3604
- output += `## Audio Quality\n`;
3605
- output += `- Total Files: ${report.audio_quality.total_files}\n`;
3606
- output += `- Avg Duration: ${report.audio_quality.avg_duration.toFixed(1)}s\n`;
3607
- output += `- Avg Sample Rate: ${report.audio_quality.avg_sample_rate.toFixed(0)}Hz\n`;
3608
- output += `- Silent: ${report.audio_quality.silent_percentage.toFixed(1)}%\n\n`;
3609
- }
3610
- if (report.video_quality) {
3611
- output += `## Video Quality\n`;
3612
- output += `- Total Files: ${report.video_quality.total_files}\n`;
3613
- output += `- Avg Duration: ${report.video_quality.avg_duration.toFixed(1)}s\n`;
3614
- output += `- Avg FPS: ${report.video_quality.avg_fps.toFixed(1)}\n`;
3615
- output += `- High Corruption Risk: ${report.video_quality.corruption_risk_high}\n\n`;
3616
- }
3617
- output += `## Recommendations\n`;
3618
- report.recommendations.forEach(rec => {
3619
- output += `- ${rec}\n`;
3620
- });
3621
- return {
3622
- content: [{ type: "text", text: output }]
3623
- };
3624
- }
3625
- catch (error) {
3626
- return {
3627
- content: [{ type: "text", text: `ERROR: Quality report generation failed: ${error.message}` }],
3628
- isError: true
3629
- };
3630
- }
3631
- }
3632
3469
  default:
3633
3470
  throw new McpError(ErrorCode.MethodNotFound, "Tool not found");
3634
3471
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vespermcp/mcp-server",
3
- "version": "1.2.29",
3
+ "version": "1.2.30",
4
4
  "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
5
5
  "type": "module",
6
6
  "main": "build/index.js",