@vespermcp/mcp-server 1.2.28 → 1.2.30
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.MCP_MIGRATION.md +116 -0
- package/build/index.js +306 -421
- package/build/python/cleaner.py +2 -0
- package/package.json +1 -1
- package/src/python/cleaner.py +2 -0
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
# Vesper MCP Migration Guide (Pre-Production)
|
|
2
|
+
|
|
3
|
+
This guide documents the MCP surface consolidation for Vesper as a general-purpose data layer.
|
|
4
|
+
|
|
5
|
+
## What Changed
|
|
6
|
+
|
|
7
|
+
- Removed: `configure_kaggle`
|
|
8
|
+
- Merged quality tools into `quality_analyze`
|
|
9
|
+
- Merged fusion tools into `fuse`
|
|
10
|
+
- Merged lineage tools into `lineage`
|
|
11
|
+
- Kept separate by design: `vesper_normalize_schema` and `vesper_convert_format`
|
|
12
|
+
|
|
13
|
+
## Deprecation Map
|
|
14
|
+
|
|
15
|
+
- `configure_kaggle` -> `configure_keys`
|
|
16
|
+
- Map: `username` -> `kaggle_username`, `key` -> `kaggle_key`
|
|
17
|
+
|
|
18
|
+
- `analyze_quality` -> `quality_analyze` with `operation="dataset"`
|
|
19
|
+
- `analyze_image_quality` -> `quality_analyze` with `operation="image"`
|
|
20
|
+
- `analyze_media_quality` -> `quality_analyze` with `operation="media"`
|
|
21
|
+
- `generate_quality_report` -> `quality_analyze` with `operation="report"`
|
|
22
|
+
|
|
23
|
+
- `fuse_datasets` -> `fuse` with `operation="tabular"`
|
|
24
|
+
- `vesper_fuse` -> `fuse` with `operation="web"`
|
|
25
|
+
|
|
26
|
+
- `get_lineage` -> `lineage` with `operation="get"`
|
|
27
|
+
- `diff_lineage_versions` -> `lineage` with `operation="diff"`
|
|
28
|
+
|
|
29
|
+
## Migration Examples
|
|
30
|
+
|
|
31
|
+
### Credentials
|
|
32
|
+
|
|
33
|
+
Old:
|
|
34
|
+
|
|
35
|
+
```json
|
|
36
|
+
{ "name": "configure_kaggle", "arguments": { "username": "u", "key": "k" } }
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
New:
|
|
40
|
+
|
|
41
|
+
```json
|
|
42
|
+
{
|
|
43
|
+
"name": "configure_keys",
|
|
44
|
+
"arguments": {
|
|
45
|
+
"kaggle_username": "u",
|
|
46
|
+
"kaggle_key": "k"
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### Quality
|
|
52
|
+
|
|
53
|
+
Old:
|
|
54
|
+
|
|
55
|
+
```json
|
|
56
|
+
{ "name": "analyze_quality", "arguments": { "dataset_id": "my_ds" } }
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
New:
|
|
60
|
+
|
|
61
|
+
```json
|
|
62
|
+
{
|
|
63
|
+
"name": "quality_analyze",
|
|
64
|
+
"arguments": {
|
|
65
|
+
"operation": "dataset",
|
|
66
|
+
"dataset_id": "my_ds"
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Fusion
|
|
72
|
+
|
|
73
|
+
Old:
|
|
74
|
+
|
|
75
|
+
```json
|
|
76
|
+
{ "name": "fuse_datasets", "arguments": { "sources": ["a", "b"], "strategy": "concat" } }
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
New:
|
|
80
|
+
|
|
81
|
+
```json
|
|
82
|
+
{
|
|
83
|
+
"name": "fuse",
|
|
84
|
+
"arguments": {
|
|
85
|
+
"operation": "tabular",
|
|
86
|
+
"sources": ["a", "b"],
|
|
87
|
+
"strategy": "concat"
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### Lineage
|
|
93
|
+
|
|
94
|
+
Old:
|
|
95
|
+
|
|
96
|
+
```json
|
|
97
|
+
{ "name": "get_lineage", "arguments": { "dataset_id": "my_ds" } }
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
New:
|
|
101
|
+
|
|
102
|
+
```json
|
|
103
|
+
{
|
|
104
|
+
"name": "lineage",
|
|
105
|
+
"arguments": {
|
|
106
|
+
"operation": "get",
|
|
107
|
+
"dataset_id": "my_ds"
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
## Notes for Agent Builders
|
|
113
|
+
|
|
114
|
+
- Prefer the new unified tools for all new integrations.
|
|
115
|
+
- Do not merge `vesper_normalize_schema` and `vesper_convert_format` unless your client can present explicit operation-specific schemas.
|
|
116
|
+
- If you have old prompts/tool maps, migrate now before production rollout.
|
package/build/index.js
CHANGED
|
@@ -339,6 +339,36 @@ function ensureLocalPipelineSource(sourcePath, datasetId, targetDir) {
|
|
|
339
339
|
}
|
|
340
340
|
return stagedPath;
|
|
341
341
|
}
|
|
342
|
+
/**
 * Best-effort removal of intermediate artifacts and their `.lineage.json`
 * sidecar files, while never touching the final output or its sidecar.
 *
 * Only regular files are removed; directories and missing paths are ignored.
 * The function never throws: cleanup must not turn a successful operation
 * into a failure.
 *
 * @param {Iterable<string>|null|undefined} artifactPaths - Candidate paths to
 *   delete. Empty and non-string entries are skipped.
 * @param {string} finalOutputPath - Path of the output that must be preserved
 *   (together with `<finalOutputPath>.lineage.json`).
 * @returns {void}
 */
function cleanupIntermediateArtifacts(artifactPaths, finalOutputPath) {
    // Without a known final output we cannot tell which files are safe to
    // delete, so skip cleanup entirely instead of throwing or guessing.
    if (typeof finalOutputPath !== "string" || finalOutputPath.length === 0 || artifactPaths == null) {
        return;
    }
    const lineageSuffix = ".lineage.json";
    // path.resolve() does not normalize letter case, and Windows file systems
    // are typically case-insensitive; compare case-folded keys there so a
    // differently-cased spelling of the final output is still protected.
    const toKey = (p) => (process.platform === "win32" ? p.toLowerCase() : p);
    const finalResolved = path.resolve(finalOutputPath);
    const protectedKeys = new Set([
        toKey(finalResolved),
        toKey(`${finalResolved}${lineageSuffix}`),
    ]);
    // Deletes `target` when it is a regular file and not a protected path.
    const removeFileIfPresent = (target) => {
        if (protectedKeys.has(toKey(target))) {
            return;
        }
        try {
            if (fs.statSync(target).isFile()) {
                fs.unlinkSync(target);
            }
        }
        catch {
            // Best-effort cleanup: a missing or undeletable file is not an error.
        }
    };
    for (const candidate of artifactPaths) {
        if (typeof candidate !== "string" || candidate.length === 0) {
            continue;
        }
        const resolved = path.resolve(candidate);
        // A candidate that IS the final output (or its sidecar) is skipped
        // together with its own sidecar.
        if (protectedKeys.has(toKey(resolved))) {
            continue;
        }
        removeFileIfPresent(resolved);
        removeFileIfPresent(`${resolved}${lineageSuffix}`);
    }
}
|
|
342
372
|
function resolveDatasetLocalPath(datasetIdOrPath, preferredDirs = []) {
|
|
343
373
|
if (fs.existsSync(datasetIdOrPath)) {
|
|
344
374
|
return ensureExportableLocalPath(datasetIdOrPath);
|
|
@@ -1463,55 +1493,6 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1463
1493
|
required: ["query"],
|
|
1464
1494
|
},
|
|
1465
1495
|
},
|
|
1466
|
-
{
|
|
1467
|
-
name: "vesper.fuse",
|
|
1468
|
-
description: "Phase 2 Data Fusion: fuse results from multiple web-native sources into one unified, deduplicated corpus (provenance via source_chain).",
|
|
1469
|
-
inputSchema: {
|
|
1470
|
-
type: "object",
|
|
1471
|
-
properties: {
|
|
1472
|
-
sources: {
|
|
1473
|
-
type: "array",
|
|
1474
|
-
description: "Web sources to collect from, each with its own query.",
|
|
1475
|
-
items: {
|
|
1476
|
-
type: "object",
|
|
1477
|
-
properties: {
|
|
1478
|
-
type: { type: "string", enum: ["arxiv", "github", "semantic_scholar", "hackernews", "s3"] },
|
|
1479
|
-
query: { type: "string", description: "Query for this source." },
|
|
1480
|
-
max_results: { type: "number", description: "Max results for this source (optional)." },
|
|
1481
|
-
min_stars: { type: "number", description: "Optional popularity filter (GitHub) based on stars/proxy fields." },
|
|
1482
|
-
bucket: { type: "string", description: "S3 bucket (for type='s3')." },
|
|
1483
|
-
path: { type: "string", description: "S3 prefix/path (for type='s3')." },
|
|
1484
|
-
region: { type: "string", description: "AWS region (for type='s3')." },
|
|
1485
|
-
credentials: {
|
|
1486
|
-
type: "object",
|
|
1487
|
-
description: "Pass-through AWS credentials (optional; not persisted).",
|
|
1488
|
-
properties: {
|
|
1489
|
-
accessKeyId: { type: "string" },
|
|
1490
|
-
secretAccessKey: { type: "string" },
|
|
1491
|
-
sessionToken: { type: "string" },
|
|
1492
|
-
roleArn: { type: "string" },
|
|
1493
|
-
}
|
|
1494
|
-
},
|
|
1495
|
-
},
|
|
1496
|
-
required: ["type", "query"],
|
|
1497
|
-
},
|
|
1498
|
-
},
|
|
1499
|
-
merge_strategy: {
|
|
1500
|
-
type: "string",
|
|
1501
|
-
enum: ["union", "dedup"],
|
|
1502
|
-
description: "How to merge collected documents.",
|
|
1503
|
-
},
|
|
1504
|
-
deduplication: {
|
|
1505
|
-
type: "string",
|
|
1506
|
-
enum: ["semantic", "exact", "none"],
|
|
1507
|
-
description: "How to deduplicate across sources.",
|
|
1508
|
-
},
|
|
1509
|
-
agent_id: { type: "string", description: "Strongly recommended: caller agent identity for lineage/audit." },
|
|
1510
|
-
pipeline_id: { type: "string", description: "Strongly recommended: workflow/pipeline identifier for lineage/audit." },
|
|
1511
|
-
},
|
|
1512
|
-
required: ["sources"],
|
|
1513
|
-
},
|
|
1514
|
-
},
|
|
1515
1496
|
{
|
|
1516
1497
|
name: "vesper.extract_web",
|
|
1517
1498
|
description: "Phase 3 Structured Web extraction. Whitelist-only domains, deterministic extraction (tables/lists/infobox), schema validation, and cache fallback on live extraction failure.",
|
|
@@ -1606,18 +1587,6 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1606
1587
|
required: ["dataset_id", "source"],
|
|
1607
1588
|
},
|
|
1608
1589
|
},
|
|
1609
|
-
{
|
|
1610
|
-
name: "configure_kaggle",
|
|
1611
|
-
description: "Optionally store Kaggle API credentials for Kaggle discover/download. Core Vesper works without this.",
|
|
1612
|
-
inputSchema: {
|
|
1613
|
-
type: "object",
|
|
1614
|
-
properties: {
|
|
1615
|
-
username: { type: "string", description: "Kaggle username" },
|
|
1616
|
-
key: { type: "string", description: "Kaggle API key" }
|
|
1617
|
-
},
|
|
1618
|
-
required: ["username", "key"],
|
|
1619
|
-
},
|
|
1620
|
-
},
|
|
1621
1590
|
{
|
|
1622
1591
|
name: "configure_keys",
|
|
1623
1592
|
description: "One-time optional key setup for external sources (Kaggle, data.world, gated HF). Core tools do not require keys.",
|
|
@@ -1646,17 +1615,29 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1646
1615
|
},
|
|
1647
1616
|
},
|
|
1648
1617
|
{
|
|
1649
|
-
name: "
|
|
1650
|
-
description: "
|
|
1618
|
+
name: "quality_analyze",
|
|
1619
|
+
description: "Unified quality tool. operation='dataset' (tabular quality), 'image', 'media', or 'report' (multimodal report).",
|
|
1651
1620
|
inputSchema: {
|
|
1652
1621
|
type: "object",
|
|
1653
1622
|
properties: {
|
|
1623
|
+
operation: {
|
|
1624
|
+
type: "string",
|
|
1625
|
+
enum: ["dataset", "image", "media", "report"],
|
|
1626
|
+
description: "Quality analysis mode. Defaults to 'dataset'.",
|
|
1627
|
+
},
|
|
1654
1628
|
dataset_id: {
|
|
1655
1629
|
type: "string",
|
|
1656
|
-
description: "
|
|
1630
|
+
description: "Dataset ID for operation='dataset' or operation='report'.",
|
|
1631
|
+
},
|
|
1632
|
+
dataset_path: {
|
|
1633
|
+
type: "string",
|
|
1634
|
+
description: "Absolute dataset directory path for operation='report'.",
|
|
1635
|
+
},
|
|
1636
|
+
path: {
|
|
1637
|
+
type: "string",
|
|
1638
|
+
description: "Absolute file/folder path for operation='image' or operation='media'.",
|
|
1657
1639
|
},
|
|
1658
1640
|
},
|
|
1659
|
-
required: ["dataset_id"],
|
|
1660
1641
|
},
|
|
1661
1642
|
},
|
|
1662
1643
|
{
|
|
@@ -1808,39 +1789,30 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1808
1789
|
},
|
|
1809
1790
|
},
|
|
1810
1791
|
{
|
|
1811
|
-
name: "
|
|
1812
|
-
description: "
|
|
1792
|
+
name: "lineage",
|
|
1793
|
+
description: "Unified lineage tool. operation='get' returns lineage/provenance history, operation='diff' compares two versions.",
|
|
1813
1794
|
inputSchema: {
|
|
1814
1795
|
type: "object",
|
|
1815
1796
|
properties: {
|
|
1816
|
-
|
|
1797
|
+
operation: {
|
|
1817
1798
|
type: "string",
|
|
1818
|
-
|
|
1799
|
+
enum: ["get", "diff"],
|
|
1800
|
+
description: "Lineage operation. Defaults to 'get'.",
|
|
1819
1801
|
},
|
|
1820
|
-
},
|
|
1821
|
-
required: ["dataset_id"],
|
|
1822
|
-
},
|
|
1823
|
-
},
|
|
1824
|
-
{
|
|
1825
|
-
name: "diff_lineage_versions",
|
|
1826
|
-
description: "Diff two lineage versions for one dataset and return structured changes (schema, rows, steps, actor identity).",
|
|
1827
|
-
inputSchema: {
|
|
1828
|
-
type: "object",
|
|
1829
|
-
properties: {
|
|
1830
1802
|
dataset_id: {
|
|
1831
1803
|
type: "string",
|
|
1832
1804
|
description: "Dataset ID (base or versioned).",
|
|
1833
1805
|
},
|
|
1834
1806
|
from_version: {
|
|
1835
1807
|
type: "number",
|
|
1836
|
-
description: "Source lineage version number (
|
|
1808
|
+
description: "Source lineage version number (required for operation='diff').",
|
|
1837
1809
|
},
|
|
1838
1810
|
to_version: {
|
|
1839
1811
|
type: "number",
|
|
1840
|
-
description: "Target lineage version number (
|
|
1812
|
+
description: "Target lineage version number (required for operation='diff').",
|
|
1841
1813
|
},
|
|
1842
1814
|
},
|
|
1843
|
-
required: ["dataset_id"
|
|
1815
|
+
required: ["dataset_id"],
|
|
1844
1816
|
},
|
|
1845
1817
|
},
|
|
1846
1818
|
{
|
|
@@ -1915,109 +1887,55 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1915
1887
|
},
|
|
1916
1888
|
},
|
|
1917
1889
|
{
|
|
1918
|
-
name: "
|
|
1919
|
-
description: "
|
|
1890
|
+
name: "fuse",
|
|
1891
|
+
description: "Unified fusion tool. operation='tabular' for row/column dataset fusion, operation='web' for web-native multi-source fusion.",
|
|
1920
1892
|
inputSchema: {
|
|
1921
1893
|
type: "object",
|
|
1922
1894
|
properties: {
|
|
1923
|
-
|
|
1924
|
-
type: "array",
|
|
1925
|
-
items: { type: "string" },
|
|
1926
|
-
description: "List of dataset IDs and/or local file paths to fuse.",
|
|
1927
|
-
},
|
|
1928
|
-
strategy: {
|
|
1929
|
-
type: "string",
|
|
1930
|
-
enum: ["concat", "join"],
|
|
1931
|
-
description: "Fusion strategy. concat appends rows; join merges on key(s).",
|
|
1932
|
-
},
|
|
1933
|
-
join_on: {
|
|
1934
|
-
oneOf: [
|
|
1935
|
-
{ type: "string" },
|
|
1936
|
-
{ type: "array", items: { type: "string" } }
|
|
1937
|
-
],
|
|
1938
|
-
description: "Join key(s). Required when strategy='join'.",
|
|
1939
|
-
},
|
|
1940
|
-
how: {
|
|
1941
|
-
type: "string",
|
|
1942
|
-
enum: ["inner", "left", "outer"],
|
|
1943
|
-
description: "Join mode (only for strategy='join').",
|
|
1944
|
-
},
|
|
1945
|
-
dedup: {
|
|
1946
|
-
type: "boolean",
|
|
1947
|
-
description: "Drop exact duplicate rows after fusion.",
|
|
1948
|
-
},
|
|
1949
|
-
run_quality_after: {
|
|
1950
|
-
type: "boolean",
|
|
1951
|
-
description: "Run quality analysis on the fused output.",
|
|
1952
|
-
},
|
|
1953
|
-
leakage_check: {
|
|
1954
|
-
type: "boolean",
|
|
1955
|
-
description: "Run leakage/overlap checks across fused sources.",
|
|
1956
|
-
},
|
|
1957
|
-
output_format: {
|
|
1958
|
-
type: "string",
|
|
1959
|
-
enum: ["feather", "parquet", "csv", "jsonl", "arrow"],
|
|
1960
|
-
description: "Output format (default: parquet).",
|
|
1961
|
-
},
|
|
1962
|
-
compression: {
|
|
1895
|
+
operation: {
|
|
1963
1896
|
type: "string",
|
|
1964
|
-
enum: ["
|
|
1965
|
-
description: "
|
|
1897
|
+
enum: ["tabular", "web"],
|
|
1898
|
+
description: "Fusion operation mode. Defaults to 'tabular'.",
|
|
1966
1899
|
},
|
|
1967
|
-
|
|
1968
|
-
type: "
|
|
1969
|
-
description: "
|
|
1900
|
+
sources: {
|
|
1901
|
+
type: "array",
|
|
1902
|
+
description: "For tabular: dataset IDs/paths. For web: source query objects.",
|
|
1903
|
+
items: {
|
|
1904
|
+
oneOf: [
|
|
1905
|
+
{ type: "string" },
|
|
1906
|
+
{
|
|
1907
|
+
type: "object",
|
|
1908
|
+
properties: {
|
|
1909
|
+
type: { type: "string", enum: ["arxiv", "github", "semantic_scholar", "hackernews", "s3"] },
|
|
1910
|
+
query: { type: "string" },
|
|
1911
|
+
max_results: { type: "number" },
|
|
1912
|
+
min_stars: { type: "number" },
|
|
1913
|
+
bucket: { type: "string" },
|
|
1914
|
+
path: { type: "string" },
|
|
1915
|
+
region: { type: "string" },
|
|
1916
|
+
},
|
|
1917
|
+
required: ["type", "query"],
|
|
1918
|
+
},
|
|
1919
|
+
],
|
|
1920
|
+
},
|
|
1970
1921
|
},
|
|
1922
|
+
strategy: { type: "string", enum: ["concat", "join"] },
|
|
1923
|
+
join_on: { oneOf: [{ type: "string" }, { type: "array", items: { type: "string" } }] },
|
|
1924
|
+
how: { type: "string", enum: ["inner", "left", "outer"] },
|
|
1925
|
+
dedup: { type: "boolean" },
|
|
1926
|
+
run_quality_after: { type: "boolean" },
|
|
1927
|
+
leakage_check: { type: "boolean" },
|
|
1928
|
+
output_format: { type: "string", enum: ["feather", "parquet", "csv", "jsonl", "arrow"] },
|
|
1929
|
+
compression: { type: "string", enum: ["lz4", "zstd", "snappy", "gzip", "uncompressed"] },
|
|
1930
|
+
preview: { type: "boolean" },
|
|
1931
|
+
merge_strategy: { type: "string", enum: ["union", "dedup"] },
|
|
1932
|
+
deduplication: { type: "string", enum: ["semantic", "exact", "none"] },
|
|
1933
|
+
agent_id: { type: "string", description: "Strongly recommended: caller agent identity for lineage/audit." },
|
|
1934
|
+
pipeline_id: { type: "string", description: "Strongly recommended: workflow/pipeline identifier for lineage/audit." },
|
|
1971
1935
|
},
|
|
1972
1936
|
required: ["sources"],
|
|
1973
1937
|
},
|
|
1974
1938
|
},
|
|
1975
|
-
{
|
|
1976
|
-
name: "analyze_image_quality",
|
|
1977
|
-
description: "Analyze image quality (resolution, blur, corruption) for a folder or single image.",
|
|
1978
|
-
inputSchema: {
|
|
1979
|
-
type: "object",
|
|
1980
|
-
properties: {
|
|
1981
|
-
path: {
|
|
1982
|
-
type: "string",
|
|
1983
|
-
description: "Absolute path to the image file or folder.",
|
|
1984
|
-
},
|
|
1985
|
-
},
|
|
1986
|
-
required: ["path"],
|
|
1987
|
-
},
|
|
1988
|
-
},
|
|
1989
|
-
{
|
|
1990
|
-
name: "analyze_media_quality",
|
|
1991
|
-
description: "Analyze audio/video quality (sample rate, duration, FPS, corruption) for a folder or single file.",
|
|
1992
|
-
inputSchema: {
|
|
1993
|
-
type: "object",
|
|
1994
|
-
properties: {
|
|
1995
|
-
path: {
|
|
1996
|
-
type: "string",
|
|
1997
|
-
description: "Absolute path to the audio/video file or folder.",
|
|
1998
|
-
},
|
|
1999
|
-
},
|
|
2000
|
-
required: ["path"],
|
|
2001
|
-
},
|
|
2002
|
-
},
|
|
2003
|
-
{
|
|
2004
|
-
name: "generate_quality_report",
|
|
2005
|
-
description: "Generate a comprehensive unified quality report for a multimodal dataset (text, image, audio, video).",
|
|
2006
|
-
inputSchema: {
|
|
2007
|
-
type: "object",
|
|
2008
|
-
properties: {
|
|
2009
|
-
dataset_id: {
|
|
2010
|
-
type: "string",
|
|
2011
|
-
description: "Dataset identifier.",
|
|
2012
|
-
},
|
|
2013
|
-
dataset_path: {
|
|
2014
|
-
type: "string",
|
|
2015
|
-
description: "Absolute path to the dataset directory.",
|
|
2016
|
-
},
|
|
2017
|
-
},
|
|
2018
|
-
required: ["dataset_id", "dataset_path"],
|
|
2019
|
-
},
|
|
2020
|
-
},
|
|
2021
1939
|
],
|
|
2022
1940
|
};
|
|
2023
1941
|
});
|
|
@@ -2084,6 +2002,101 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2084
2002
|
markStepComplete(String(datasetId), String(step));
|
|
2085
2003
|
}
|
|
2086
2004
|
switch (request.params.name) {
|
|
2005
|
+
case "lineage":
|
|
2006
|
+
case "get_lineage":
|
|
2007
|
+
case "diff_lineage_versions": {
|
|
2008
|
+
const operation = request.params.name === "get_lineage"
|
|
2009
|
+
? "get"
|
|
2010
|
+
: request.params.name === "diff_lineage_versions"
|
|
2011
|
+
? "diff"
|
|
2012
|
+
: String(request.params.arguments?.operation || "get").toLowerCase();
|
|
2013
|
+
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
2014
|
+
if (!datasetId) {
|
|
2015
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
2016
|
+
}
|
|
2017
|
+
if (operation === "get") {
|
|
2018
|
+
const base = toBaseDatasetId(datasetId);
|
|
2019
|
+
const record = readLineageRecord(base);
|
|
2020
|
+
if (!record.versions || record.versions.length === 0) {
|
|
2021
|
+
return {
|
|
2022
|
+
content: [{ type: "text", text: `No lineage found for '${datasetId}' yet.` }]
|
|
2023
|
+
};
|
|
2024
|
+
}
|
|
2025
|
+
return {
|
|
2026
|
+
content: [{ type: "text", text: JSON.stringify(record, null, 2) }]
|
|
2027
|
+
};
|
|
2028
|
+
}
|
|
2029
|
+
if (operation !== "diff") {
|
|
2030
|
+
throw new McpError(ErrorCode.InvalidParams, "operation must be 'get' or 'diff'");
|
|
2031
|
+
}
|
|
2032
|
+
const fromVersion = Number(request.params.arguments?.from_version);
|
|
2033
|
+
const toVersion = Number(request.params.arguments?.to_version);
|
|
2034
|
+
if (!Number.isInteger(fromVersion) || fromVersion <= 0) {
|
|
2035
|
+
throw new McpError(ErrorCode.InvalidParams, "from_version must be a positive integer");
|
|
2036
|
+
}
|
|
2037
|
+
if (!Number.isInteger(toVersion) || toVersion <= 0) {
|
|
2038
|
+
throw new McpError(ErrorCode.InvalidParams, "to_version must be a positive integer");
|
|
2039
|
+
}
|
|
2040
|
+
const base = toBaseDatasetId(datasetId);
|
|
2041
|
+
const record = readLineageRecord(base);
|
|
2042
|
+
const fromV = record.versions.find((v) => v.version === fromVersion);
|
|
2043
|
+
const toV = record.versions.find((v) => v.version === toVersion);
|
|
2044
|
+
if (!fromV || !toV) {
|
|
2045
|
+
return {
|
|
2046
|
+
content: [{ type: "text", text: `ERROR: Could not find both versions in lineage for '${datasetId}'.` }],
|
|
2047
|
+
isError: true,
|
|
2048
|
+
};
|
|
2049
|
+
}
|
|
2050
|
+
const fromSchema = (toV.output?.schema_before && toVersion > fromVersion)
|
|
2051
|
+
? fromV.output?.schema_after || fromV.output?.schema_before || {}
|
|
2052
|
+
: fromV.output?.schema_after || fromV.output?.schema_before || {};
|
|
2053
|
+
const toSchema = toV.output?.schema_after || toV.output?.schema_before || {};
|
|
2054
|
+
const fromCols = Array.isArray(fromSchema.columns) ? fromSchema.columns.map((c) => String(c)) : [];
|
|
2055
|
+
const toCols = Array.isArray(toSchema.columns) ? toSchema.columns.map((c) => String(c)) : [];
|
|
2056
|
+
const fromDtypes = (fromSchema.dtypes && typeof fromSchema.dtypes === "object") ? fromSchema.dtypes : {};
|
|
2057
|
+
const toDtypes = (toSchema.dtypes && typeof toSchema.dtypes === "object") ? toSchema.dtypes : {};
|
|
2058
|
+
const schemaDiff = diffSchemaMaps(fromCols, toCols, fromDtypes, toDtypes);
|
|
2059
|
+
const fromRows = typeof fromSchema.rows === "number" ? fromSchema.rows : (typeof fromV.output?.rows === "number" ? fromV.output.rows : undefined);
|
|
2060
|
+
const toRows = typeof toSchema.rows === "number" ? toSchema.rows : (typeof toV.output?.rows === "number" ? toV.output.rows : undefined);
|
|
2061
|
+
const fromSteps = new Set((fromV.steps || []).map((s) => String(s.step)));
|
|
2062
|
+
const toSteps = new Set((toV.steps || []).map((s) => String(s.step)));
|
|
2063
|
+
return {
|
|
2064
|
+
content: [{
|
|
2065
|
+
type: "text",
|
|
2066
|
+
text: JSON.stringify({
|
|
2067
|
+
dataset_id_base: base,
|
|
2068
|
+
from_version: fromVersion,
|
|
2069
|
+
to_version: toVersion,
|
|
2070
|
+
schema_diff: schemaDiff,
|
|
2071
|
+
row_count_delta: {
|
|
2072
|
+
from: fromRows,
|
|
2073
|
+
to: toRows,
|
|
2074
|
+
delta: (typeof fromRows === "number" && typeof toRows === "number") ? (toRows - fromRows) : undefined,
|
|
2075
|
+
},
|
|
2076
|
+
steps_diff: {
|
|
2077
|
+
added: Array.from(toSteps).filter((s) => !fromSteps.has(s)),
|
|
2078
|
+
removed: Array.from(fromSteps).filter((s) => !toSteps.has(s)),
|
|
2079
|
+
from_steps: Array.from(fromSteps),
|
|
2080
|
+
to_steps: Array.from(toSteps),
|
|
2081
|
+
},
|
|
2082
|
+
actor_diff: {
|
|
2083
|
+
changed: String(fromV.triggered_by?.agent_id || "") !== String(toV.triggered_by?.agent_id || "") ||
|
|
2084
|
+
String(fromV.triggered_by?.pipeline_id || "") !== String(toV.triggered_by?.pipeline_id || ""),
|
|
2085
|
+
from: {
|
|
2086
|
+
tool: fromV.triggered_by?.tool,
|
|
2087
|
+
agent_id: fromV.triggered_by?.agent_id,
|
|
2088
|
+
pipeline_id: fromV.triggered_by?.pipeline_id,
|
|
2089
|
+
},
|
|
2090
|
+
to: {
|
|
2091
|
+
tool: toV.triggered_by?.tool,
|
|
2092
|
+
agent_id: toV.triggered_by?.agent_id,
|
|
2093
|
+
pipeline_id: toV.triggered_by?.pipeline_id,
|
|
2094
|
+
},
|
|
2095
|
+
},
|
|
2096
|
+
}, null, 2),
|
|
2097
|
+
}],
|
|
2098
|
+
};
|
|
2099
|
+
}
|
|
2087
2100
|
case "vesper_web_find": {
|
|
2088
2101
|
hydrateExternalKeys();
|
|
2089
2102
|
const query = String(request.params.arguments?.query || "").trim();
|
|
@@ -2568,20 +2581,6 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2568
2581
|
};
|
|
2569
2582
|
}
|
|
2570
2583
|
}
|
|
2571
|
-
case "configure_kaggle": {
|
|
2572
|
-
const username = String(request.params.arguments?.username || "").trim();
|
|
2573
|
-
const key = String(request.params.arguments?.key || "").trim();
|
|
2574
|
-
if (!username || !key) {
|
|
2575
|
-
throw new McpError(ErrorCode.InvalidParams, "username and key are required");
|
|
2576
|
-
}
|
|
2577
|
-
const r1 = secureKeys.set("kaggle_username", username);
|
|
2578
|
-
const r2 = secureKeys.set("kaggle_key", key);
|
|
2579
|
-
process.env.KAGGLE_USERNAME = username;
|
|
2580
|
-
process.env.KAGGLE_KEY = key;
|
|
2581
|
-
return {
|
|
2582
|
-
content: [{ type: "text", text: `Kaggle credentials saved securely (${r1.method || "store"}/${r2.method || "store"}). You can now use Kaggle datasets.` }]
|
|
2583
|
-
};
|
|
2584
|
-
}
|
|
2585
2584
|
case "configure_keys": {
|
|
2586
2585
|
const hfToken = String(request.params.arguments?.hf_token || "").trim();
|
|
2587
2586
|
const kaggleUsername = String(request.params.arguments?.kaggle_username || "").trim();
|
|
@@ -2687,8 +2686,56 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2687
2686
|
const formattedOutput = formatDatasetInfo(dataset);
|
|
2688
2687
|
return { content: [{ type: "text", text: formattedOutput }] };
|
|
2689
2688
|
}
|
|
2690
|
-
case "
|
|
2691
|
-
|
|
2689
|
+
case "quality_analyze":
|
|
2690
|
+
case "analyze_quality":
|
|
2691
|
+
case "analyze_image_quality":
|
|
2692
|
+
case "analyze_media_quality":
|
|
2693
|
+
case "generate_quality_report": {
|
|
2694
|
+
const resolvedOperation = request.params.name === "analyze_image_quality"
|
|
2695
|
+
? "image"
|
|
2696
|
+
: request.params.name === "analyze_media_quality"
|
|
2697
|
+
? "media"
|
|
2698
|
+
: request.params.name === "generate_quality_report"
|
|
2699
|
+
? "report"
|
|
2700
|
+
: String(request.params.arguments?.operation || "dataset").toLowerCase();
|
|
2701
|
+
if (resolvedOperation === "image") {
|
|
2702
|
+
const inputPath = String(request.params.arguments?.path || "").trim();
|
|
2703
|
+
if (!inputPath || !fs.existsSync(inputPath)) {
|
|
2704
|
+
throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
|
|
2705
|
+
}
|
|
2706
|
+
const report = await imageAnalyzer.analyze(inputPath);
|
|
2707
|
+
return { content: [{ type: "text", text: JSON.stringify(report, null, 2) }] };
|
|
2708
|
+
}
|
|
2709
|
+
if (resolvedOperation === "media") {
|
|
2710
|
+
const inputPath = String(request.params.arguments?.path || "").trim();
|
|
2711
|
+
if (!inputPath || !fs.existsSync(inputPath)) {
|
|
2712
|
+
throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
|
|
2713
|
+
}
|
|
2714
|
+
const report = await mediaAnalyzer.analyze(inputPath);
|
|
2715
|
+
return { content: [{ type: "text", text: JSON.stringify(report, null, 2) }] };
|
|
2716
|
+
}
|
|
2717
|
+
if (resolvedOperation === "report") {
|
|
2718
|
+
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
2719
|
+
const datasetPath = String(request.params.arguments?.dataset_path || "").trim();
|
|
2720
|
+
if (!datasetId) {
|
|
2721
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='report'");
|
|
2722
|
+
}
|
|
2723
|
+
if (!datasetPath || !fs.existsSync(datasetPath)) {
|
|
2724
|
+
throw new McpError(ErrorCode.InvalidParams, `Dataset path not found: ${datasetPath}`);
|
|
2725
|
+
}
|
|
2726
|
+
const metadata = await metadataStore.getDataset(datasetId);
|
|
2727
|
+
const textQuality = null;
|
|
2728
|
+
const report = await qualityOrchestrator.generateReport(datasetId, datasetPath, textQuality);
|
|
2729
|
+
if (metadata) {
|
|
2730
|
+
metadata.unified_quality_report = report;
|
|
2731
|
+
await metadataStore.saveDataset(metadata);
|
|
2732
|
+
}
|
|
2733
|
+
return { content: [{ type: "text", text: JSON.stringify(report, null, 2) }] };
|
|
2734
|
+
}
|
|
2735
|
+
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
2736
|
+
if (!datasetId) {
|
|
2737
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='dataset'");
|
|
2738
|
+
}
|
|
2692
2739
|
const safeId = toSafeDatasetPathFragment(datasetId);
|
|
2693
2740
|
const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
|
|
2694
2741
|
const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
|
|
@@ -2898,6 +2945,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2898
2945
|
case "export_dataset": {
|
|
2899
2946
|
const datasetId = String(request.params.arguments?.dataset_id);
|
|
2900
2947
|
const isDirectLocalInput = isDirectLocalDatasetReference(datasetId);
|
|
2948
|
+
const intermediateArtifacts = new Set();
|
|
2901
2949
|
const requestedTargetDir = request.params.arguments?.target_dir
|
|
2902
2950
|
? String(request.params.arguments?.target_dir).trim()
|
|
2903
2951
|
: request.params.arguments?.output_dir
|
|
@@ -2967,9 +3015,16 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2967
3015
|
else if (currentExt !== pipelineFmt) {
|
|
2968
3016
|
console.error(`[Export] Running quality/cleaning pipeline (use fast=true to skip)...`);
|
|
2969
3017
|
try {
|
|
3018
|
+
const beforeStagingPath = sourcePath;
|
|
2970
3019
|
sourcePath = ensureLocalPipelineSource(sourcePath, datasetId, targetDir);
|
|
3020
|
+
if (path.resolve(beforeStagingPath) !== path.resolve(sourcePath)) {
|
|
3021
|
+
intermediateArtifacts.add(sourcePath);
|
|
3022
|
+
}
|
|
2971
3023
|
const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
|
|
2972
3024
|
if (pipelineResult.final_output_path) {
|
|
3025
|
+
if (path.resolve(pipelineResult.final_output_path) !== path.resolve(sourcePath)) {
|
|
3026
|
+
intermediateArtifacts.add(pipelineResult.final_output_path);
|
|
3027
|
+
}
|
|
2973
3028
|
sourcePath = pipelineResult.final_output_path;
|
|
2974
3029
|
try {
|
|
2975
3030
|
// Update registry to point to pipeline's final output
|
|
@@ -3058,6 +3113,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
3058
3113
|
msg += ` Python: \`pd.read_parquet('${result.output_path}').head()\`\n`;
|
|
3059
3114
|
msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
|
|
3060
3115
|
}
|
|
3116
|
+
cleanupIntermediateArtifacts(intermediateArtifacts, result.output_path);
|
|
3061
3117
|
return { content: [{ type: "text", text: msg }] };
|
|
3062
3118
|
}
|
|
3063
3119
|
catch (error) {
|
|
@@ -3084,100 +3140,6 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
3084
3140
|
content: [{ type: "text", text: `**Vesper Registry** (${entries.length} dataset${entries.length !== 1 ? "s" : ""}):\n\n${lines.join("\n\n")}` }]
|
|
3085
3141
|
};
|
|
3086
3142
|
}
|
|
3087
|
-
case "get_lineage": {
|
|
3088
|
-
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
3089
|
-
if (!datasetId) {
|
|
3090
|
-
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
3091
|
-
}
|
|
3092
|
-
const base = toBaseDatasetId(datasetId);
|
|
3093
|
-
const record = readLineageRecord(base);
|
|
3094
|
-
if (!record.versions || record.versions.length === 0) {
|
|
3095
|
-
return {
|
|
3096
|
-
content: [{ type: "text", text: `No lineage found for '${datasetId}' yet.` }]
|
|
3097
|
-
};
|
|
3098
|
-
}
|
|
3099
|
-
return {
|
|
3100
|
-
content: [{ type: "text", text: JSON.stringify(record, null, 2) }]
|
|
3101
|
-
};
|
|
3102
|
-
}
|
|
3103
|
-
case "diff_lineage_versions": {
|
|
3104
|
-
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
3105
|
-
const fromVersion = Number(request.params.arguments?.from_version);
|
|
3106
|
-
const toVersion = Number(request.params.arguments?.to_version);
|
|
3107
|
-
if (!datasetId) {
|
|
3108
|
-
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
3109
|
-
}
|
|
3110
|
-
if (!Number.isInteger(fromVersion) || fromVersion <= 0) {
|
|
3111
|
-
throw new McpError(ErrorCode.InvalidParams, "from_version must be a positive integer");
|
|
3112
|
-
}
|
|
3113
|
-
if (!Number.isInteger(toVersion) || toVersion <= 0) {
|
|
3114
|
-
throw new McpError(ErrorCode.InvalidParams, "to_version must be a positive integer");
|
|
3115
|
-
}
|
|
3116
|
-
const base = toBaseDatasetId(datasetId);
|
|
3117
|
-
const record = readLineageRecord(base);
|
|
3118
|
-
const fromV = record.versions.find((v) => v.version === fromVersion);
|
|
3119
|
-
const toV = record.versions.find((v) => v.version === toVersion);
|
|
3120
|
-
if (!fromV || !toV) {
|
|
3121
|
-
return {
|
|
3122
|
-
content: [{ type: "text", text: `ERROR: Could not find both versions in lineage for '${datasetId}'.` }],
|
|
3123
|
-
isError: true,
|
|
3124
|
-
};
|
|
3125
|
-
}
|
|
3126
|
-
const fromSchema = (toV.output?.schema_before && toVersion > fromVersion)
|
|
3127
|
-
? fromV.output?.schema_after || fromV.output?.schema_before || {}
|
|
3128
|
-
: fromV.output?.schema_after || fromV.output?.schema_before || {};
|
|
3129
|
-
const toSchema = toV.output?.schema_after || toV.output?.schema_before || {};
|
|
3130
|
-
const fromCols = Array.isArray(fromSchema.columns) ? fromSchema.columns.map((c) => String(c)) : [];
|
|
3131
|
-
const toCols = Array.isArray(toSchema.columns) ? toSchema.columns.map((c) => String(c)) : [];
|
|
3132
|
-
const fromDtypes = (fromSchema.dtypes && typeof fromSchema.dtypes === "object") ? fromSchema.dtypes : {};
|
|
3133
|
-
const toDtypes = (toSchema.dtypes && typeof toSchema.dtypes === "object") ? toSchema.dtypes : {};
|
|
3134
|
-
const schemaDiff = diffSchemaMaps(fromCols, toCols, fromDtypes, toDtypes);
|
|
3135
|
-
const fromRows = typeof fromSchema.rows === "number"
|
|
3136
|
-
? fromSchema.rows
|
|
3137
|
-
: (typeof fromV.output?.rows === "number" ? fromV.output.rows : undefined);
|
|
3138
|
-
const toRows = typeof toSchema.rows === "number"
|
|
3139
|
-
? toSchema.rows
|
|
3140
|
-
: (typeof toV.output?.rows === "number" ? toV.output.rows : undefined);
|
|
3141
|
-
const fromSteps = new Set((fromV.steps || []).map((s) => String(s.step)));
|
|
3142
|
-
const toSteps = new Set((toV.steps || []).map((s) => String(s.step)));
|
|
3143
|
-
const addedSteps = Array.from(toSteps).filter((s) => !fromSteps.has(s));
|
|
3144
|
-
const removedSteps = Array.from(fromSteps).filter((s) => !toSteps.has(s));
|
|
3145
|
-
const actorDiff = {
|
|
3146
|
-
changed: String(fromV.triggered_by?.agent_id || "") !== String(toV.triggered_by?.agent_id || "") ||
|
|
3147
|
-
String(fromV.triggered_by?.pipeline_id || "") !== String(toV.triggered_by?.pipeline_id || ""),
|
|
3148
|
-
from: {
|
|
3149
|
-
tool: fromV.triggered_by?.tool,
|
|
3150
|
-
agent_id: fromV.triggered_by?.agent_id,
|
|
3151
|
-
pipeline_id: fromV.triggered_by?.pipeline_id,
|
|
3152
|
-
},
|
|
3153
|
-
to: {
|
|
3154
|
-
tool: toV.triggered_by?.tool,
|
|
3155
|
-
agent_id: toV.triggered_by?.agent_id,
|
|
3156
|
-
pipeline_id: toV.triggered_by?.pipeline_id,
|
|
3157
|
-
},
|
|
3158
|
-
};
|
|
3159
|
-
const diffResult = {
|
|
3160
|
-
dataset_id_base: base,
|
|
3161
|
-
from_version: fromVersion,
|
|
3162
|
-
to_version: toVersion,
|
|
3163
|
-
schema_diff: schemaDiff,
|
|
3164
|
-
row_count_delta: {
|
|
3165
|
-
from: fromRows,
|
|
3166
|
-
to: toRows,
|
|
3167
|
-
delta: (typeof fromRows === "number" && typeof toRows === "number") ? (toRows - fromRows) : undefined,
|
|
3168
|
-
},
|
|
3169
|
-
steps_diff: {
|
|
3170
|
-
added: addedSteps,
|
|
3171
|
-
removed: removedSteps,
|
|
3172
|
-
from_steps: Array.from(fromSteps),
|
|
3173
|
-
to_steps: Array.from(toSteps),
|
|
3174
|
-
},
|
|
3175
|
-
actor_diff: actorDiff,
|
|
3176
|
-
};
|
|
3177
|
-
return {
|
|
3178
|
-
content: [{ type: "text", text: JSON.stringify(diffResult, null, 2) }],
|
|
3179
|
-
};
|
|
3180
|
-
}
|
|
3181
3143
|
case "vesper_convert_format": {
|
|
3182
3144
|
const filePath = String(request.params.arguments?.file_path || "").trim();
|
|
3183
3145
|
const targetFormat = String(request.params.arguments?.target_format || "").trim().toLowerCase();
|
|
@@ -3340,7 +3302,57 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
3340
3302
|
return { content: [{ type: "text", text: `ERROR: Schema normalization failed: ${error.message}` }], isError: true };
|
|
3341
3303
|
}
|
|
3342
3304
|
}
|
|
3305
|
+
case "fuse":
|
|
3343
3306
|
case "fuse_datasets": {
|
|
3307
|
+
const operation = request.params.name === "fuse_datasets"
|
|
3308
|
+
? "tabular"
|
|
3309
|
+
: String(request.params.arguments?.operation || "tabular").toLowerCase();
|
|
3310
|
+
if (operation === "web") {
|
|
3311
|
+
hydrateExternalKeys();
|
|
3312
|
+
const webSources = Array.isArray(request.params.arguments?.sources)
|
|
3313
|
+
? request.params.arguments?.sources
|
|
3314
|
+
: undefined;
|
|
3315
|
+
if (!webSources || !Array.isArray(webSources)) {
|
|
3316
|
+
return {
|
|
3317
|
+
content: [{ type: "text", text: "ERROR: fuse(operation='web') requires 'sources' array." }],
|
|
3318
|
+
isError: true,
|
|
3319
|
+
};
|
|
3320
|
+
}
|
|
3321
|
+
const mergeStrategyRaw = request.params.arguments?.merge_strategy
|
|
3322
|
+
? String(request.params.arguments?.merge_strategy).toLowerCase()
|
|
3323
|
+
: undefined;
|
|
3324
|
+
const dedupRaw = request.params.arguments?.deduplication
|
|
3325
|
+
? String(request.params.arguments?.deduplication).toLowerCase()
|
|
3326
|
+
: undefined;
|
|
3327
|
+
const merge_strategy = mergeStrategyRaw && ["union", "dedup"].includes(mergeStrategyRaw)
|
|
3328
|
+
? mergeStrategyRaw
|
|
3329
|
+
: undefined;
|
|
3330
|
+
const deduplication = dedupRaw && ["semantic", "exact", "none"].includes(dedupRaw)
|
|
3331
|
+
? dedupRaw
|
|
3332
|
+
: undefined;
|
|
3333
|
+
const webResult = await webFusionEngine.fuse({
|
|
3334
|
+
sources: webSources.map((s) => ({
|
|
3335
|
+
type: String(s?.type || "").trim().toLowerCase(),
|
|
3336
|
+
query: String(s?.query || "").trim(),
|
|
3337
|
+
max_results: s?.max_results !== undefined ? Number(s.max_results) : undefined,
|
|
3338
|
+
min_stars: s?.min_stars !== undefined ? Number(s.min_stars) : undefined,
|
|
3339
|
+
bucket: s?.bucket !== undefined ? String(s.bucket) : undefined,
|
|
3340
|
+
path: s?.path !== undefined ? String(s.path) : undefined,
|
|
3341
|
+
region: s?.region !== undefined ? String(s.region) : undefined,
|
|
3342
|
+
credentials: s?.credentials ? {
|
|
3343
|
+
accessKeyId: s.credentials.accessKeyId !== undefined ? String(s.credentials.accessKeyId) : undefined,
|
|
3344
|
+
secretAccessKey: s.credentials.secretAccessKey !== undefined ? String(s.credentials.secretAccessKey) : undefined,
|
|
3345
|
+
sessionToken: s.credentials.sessionToken !== undefined ? String(s.credentials.sessionToken) : undefined,
|
|
3346
|
+
roleArn: s.credentials.roleArn !== undefined ? String(s.credentials.roleArn) : undefined,
|
|
3347
|
+
} : undefined,
|
|
3348
|
+
})),
|
|
3349
|
+
merge_strategy,
|
|
3350
|
+
deduplication,
|
|
3351
|
+
});
|
|
3352
|
+
return {
|
|
3353
|
+
content: [{ type: "text", text: JSON.stringify(webResult, null, 2) }],
|
|
3354
|
+
};
|
|
3355
|
+
}
|
|
3344
3356
|
const rawSources = request.params.arguments?.sources;
|
|
3345
3357
|
if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
|
|
3346
3358
|
throw new McpError(ErrorCode.InvalidParams, "sources must contain at least 2 dataset IDs/paths");
|
|
@@ -3454,142 +3466,6 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
3454
3466
|
};
|
|
3455
3467
|
}
|
|
3456
3468
|
}
|
|
3457
|
-
case "analyze_image_quality": {
|
|
3458
|
-
const inputPath = String(request.params.arguments?.path);
|
|
3459
|
-
if (!fs.existsSync(inputPath)) {
|
|
3460
|
-
throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
|
|
3461
|
-
}
|
|
3462
|
-
try {
|
|
3463
|
-
const report = await imageAnalyzer.analyze(inputPath);
|
|
3464
|
-
let output = `## Image Quality Report\n\n`;
|
|
3465
|
-
output += `- **Total Images**: ${report.total_images}\n`;
|
|
3466
|
-
output += `- **Corrupted**: ${report.corrupted_count}\n`;
|
|
3467
|
-
output += `- **Average Resolution**: ${Math.round(report.average_width)}x${Math.round(report.average_height)}\n`;
|
|
3468
|
-
output += `- **Blurry Images**: ${report.blurry_count}\n\n`;
|
|
3469
|
-
if (report.individual_results.length > 0) {
|
|
3470
|
-
output += `### Sample Detail (Top 5)\n`;
|
|
3471
|
-
report.individual_results.slice(0, 5).forEach(img => {
|
|
3472
|
-
const statusLabel = img.status === "ok" ? (img.is_blurry ? "[warn]" : "[ok]") : "[error]";
|
|
3473
|
-
output += `${statusLabel} **${img.filename}**: ${img.width}x${img.height} | Mode: ${img.mode} | Blur: ${img.blur_score?.toFixed(1)}\n`;
|
|
3474
|
-
});
|
|
3475
|
-
}
|
|
3476
|
-
return {
|
|
3477
|
-
content: [{ type: "text", text: output }]
|
|
3478
|
-
};
|
|
3479
|
-
}
|
|
3480
|
-
catch (error) {
|
|
3481
|
-
return {
|
|
3482
|
-
content: [{ type: "text", text: `ERROR: Image analysis failed: ${error.message}` }],
|
|
3483
|
-
isError: true
|
|
3484
|
-
};
|
|
3485
|
-
}
|
|
3486
|
-
}
|
|
3487
|
-
case "analyze_media_quality": {
|
|
3488
|
-
const inputPath = String(request.params.arguments?.path);
|
|
3489
|
-
if (!fs.existsSync(inputPath)) {
|
|
3490
|
-
throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
|
|
3491
|
-
}
|
|
3492
|
-
try {
|
|
3493
|
-
const report = await mediaAnalyzer.analyze(inputPath);
|
|
3494
|
-
let output = `## Media Quality Report\n\n`;
|
|
3495
|
-
output += `- **Total Files**: ${report.total_files}\n`;
|
|
3496
|
-
output += `- **OK Files**: ${report.ok_files}\n`;
|
|
3497
|
-
output += `- **Failed Files**: ${report.failed_files}\n`;
|
|
3498
|
-
if ('avg_audio_duration' in report && report.avg_audio_duration) {
|
|
3499
|
-
output += `- **Average Audio Duration**: ${report.avg_audio_duration.toFixed(2)}s\n`;
|
|
3500
|
-
}
|
|
3501
|
-
if ('avg_video_duration' in report && report.avg_video_duration) {
|
|
3502
|
-
output += `- **Average Video Duration**: ${report.avg_video_duration.toFixed(2)}s\n`;
|
|
3503
|
-
output += `- **Average FPS**: ${report.avg_fps?.toFixed(1)}\n`;
|
|
3504
|
-
}
|
|
3505
|
-
output += `\n### Sample Detail (Top 5)\n`;
|
|
3506
|
-
report.details.slice(0, 5).forEach(item => {
|
|
3507
|
-
const statusLabel = item.status === "ok" ? "[ok]" : "[error]";
|
|
3508
|
-
if (item.type === "audio" && 'sample_rate' in item) {
|
|
3509
|
-
output += `${statusLabel} **${item.filename}**: ${item.duration?.toFixed(1)}s | SR: ${item.sample_rate}Hz | Silent: ${item.is_silent}\n`;
|
|
3510
|
-
}
|
|
3511
|
-
else if (item.type === "video" && 'width' in item) {
|
|
3512
|
-
output += `${statusLabel} **${item.filename}**: ${item.width}x${item.height} | ${item.fps?.toFixed(1)}fps | Risk: ${item.corruption_risk}\n`;
|
|
3513
|
-
}
|
|
3514
|
-
else {
|
|
3515
|
-
output += `${statusLabel} **${item.filename}**: ${item.error}\n`;
|
|
3516
|
-
}
|
|
3517
|
-
});
|
|
3518
|
-
return {
|
|
3519
|
-
content: [{ type: "text", text: output }]
|
|
3520
|
-
};
|
|
3521
|
-
}
|
|
3522
|
-
catch (error) {
|
|
3523
|
-
return {
|
|
3524
|
-
content: [{ type: "text", text: `ERROR: Media analysis failed: ${error.message}` }],
|
|
3525
|
-
isError: true
|
|
3526
|
-
};
|
|
3527
|
-
}
|
|
3528
|
-
}
|
|
3529
|
-
case "generate_quality_report": {
|
|
3530
|
-
const datasetId = String(request.params.arguments?.dataset_id);
|
|
3531
|
-
const datasetPath = String(request.params.arguments?.dataset_path);
|
|
3532
|
-
if (!fs.existsSync(datasetPath)) {
|
|
3533
|
-
throw new McpError(ErrorCode.InvalidParams, `Dataset path not found: ${datasetPath}`);
|
|
3534
|
-
}
|
|
3535
|
-
try {
|
|
3536
|
-
// Optionally load text quality from metadata if available
|
|
3537
|
-
const metadata = await metadataStore.getDataset(datasetId);
|
|
3538
|
-
// TODO: Integrate text quality analysis when available
|
|
3539
|
-
const textQuality = null;
|
|
3540
|
-
const report = await qualityOrchestrator.generateReport(datasetId, datasetPath, textQuality);
|
|
3541
|
-
// Save report to metadata
|
|
3542
|
-
if (metadata) {
|
|
3543
|
-
metadata.unified_quality_report = report;
|
|
3544
|
-
await metadataStore.saveDataset(metadata);
|
|
3545
|
-
}
|
|
3546
|
-
let output = `# Unified Quality Report\n\n`;
|
|
3547
|
-
output += `**Dataset**: ${datasetId}\n`;
|
|
3548
|
-
output += `**Modalities**: ${report.modalities.join(", ")}\n`;
|
|
3549
|
-
output += `**Overall Quality Score**: ${report.overall_quality_score}/100\n\n`;
|
|
3550
|
-
if (report.text_quality) {
|
|
3551
|
-
output += `## Text Quality\n`;
|
|
3552
|
-
output += `- Rows: ${report.text_quality.row_count}\n`;
|
|
3553
|
-
output += `- Columns: ${report.text_quality.column_count}\n`;
|
|
3554
|
-
output += `- Missing: ${report.text_quality.missing_percentage.toFixed(1)}%\n`;
|
|
3555
|
-
output += `- Duplicates: ${report.text_quality.duplicate_percentage.toFixed(1)}%\n\n`;
|
|
3556
|
-
}
|
|
3557
|
-
if (report.image_quality) {
|
|
3558
|
-
output += `## Image Quality\n`;
|
|
3559
|
-
output += `- Total Images: ${report.image_quality.total_images}\n`;
|
|
3560
|
-
output += `- Corrupted: ${report.image_quality.corrupted_count}\n`;
|
|
3561
|
-
output += `- Avg Resolution: ${report.image_quality.avg_resolution}\n`;
|
|
3562
|
-
output += `- Blurry: ${report.image_quality.blurry_percentage.toFixed(1)}%\n\n`;
|
|
3563
|
-
}
|
|
3564
|
-
if (report.audio_quality) {
|
|
3565
|
-
output += `## Audio Quality\n`;
|
|
3566
|
-
output += `- Total Files: ${report.audio_quality.total_files}\n`;
|
|
3567
|
-
output += `- Avg Duration: ${report.audio_quality.avg_duration.toFixed(1)}s\n`;
|
|
3568
|
-
output += `- Avg Sample Rate: ${report.audio_quality.avg_sample_rate.toFixed(0)}Hz\n`;
|
|
3569
|
-
output += `- Silent: ${report.audio_quality.silent_percentage.toFixed(1)}%\n\n`;
|
|
3570
|
-
}
|
|
3571
|
-
if (report.video_quality) {
|
|
3572
|
-
output += `## Video Quality\n`;
|
|
3573
|
-
output += `- Total Files: ${report.video_quality.total_files}\n`;
|
|
3574
|
-
output += `- Avg Duration: ${report.video_quality.avg_duration.toFixed(1)}s\n`;
|
|
3575
|
-
output += `- Avg FPS: ${report.video_quality.avg_fps.toFixed(1)}\n`;
|
|
3576
|
-
output += `- High Corruption Risk: ${report.video_quality.corruption_risk_high}\n\n`;
|
|
3577
|
-
}
|
|
3578
|
-
output += `## Recommendations\n`;
|
|
3579
|
-
report.recommendations.forEach(rec => {
|
|
3580
|
-
output += `- ${rec}\n`;
|
|
3581
|
-
});
|
|
3582
|
-
return {
|
|
3583
|
-
content: [{ type: "text", text: output }]
|
|
3584
|
-
};
|
|
3585
|
-
}
|
|
3586
|
-
catch (error) {
|
|
3587
|
-
return {
|
|
3588
|
-
content: [{ type: "text", text: `ERROR: Quality report generation failed: ${error.message}` }],
|
|
3589
|
-
isError: true
|
|
3590
|
-
};
|
|
3591
|
-
}
|
|
3592
|
-
}
|
|
3593
3469
|
default:
|
|
3594
3470
|
throw new McpError(ErrorCode.MethodNotFound, "Tool not found");
|
|
3595
3471
|
}
|
|
@@ -3933,6 +3809,7 @@ async function runExportCli(args) {
|
|
|
3933
3809
|
const fastMode = args.includes("--fast");
|
|
3934
3810
|
const preview = args.includes("--preview");
|
|
3935
3811
|
const isDirectLocalInput = isDirectLocalDatasetReference(datasetId);
|
|
3812
|
+
const intermediateArtifacts = new Set();
|
|
3936
3813
|
const resolvedTargetDir = path.resolve(targetDir || process.cwd());
|
|
3937
3814
|
let sourcePath = resolveDatasetLocalPath(datasetId, [resolvedTargetDir, process.cwd()]);
|
|
3938
3815
|
if (!sourcePath) {
|
|
@@ -3954,9 +3831,16 @@ async function runExportCli(args) {
|
|
|
3954
3831
|
const pipelineCompatibleInput = currentExt === "csv" || currentExt === "parquet";
|
|
3955
3832
|
if (pipelineCompatibleInput && currentExt !== pipelineFmt) {
|
|
3956
3833
|
try {
|
|
3834
|
+
const beforeStagingPath = sourcePath;
|
|
3957
3835
|
sourcePath = ensureLocalPipelineSource(sourcePath, datasetId, resolvedTargetDir);
|
|
3836
|
+
if (path.resolve(beforeStagingPath) !== path.resolve(sourcePath)) {
|
|
3837
|
+
intermediateArtifacts.add(sourcePath);
|
|
3838
|
+
}
|
|
3958
3839
|
const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
|
|
3959
3840
|
if (pipelineResult.final_output_path) {
|
|
3841
|
+
if (path.resolve(pipelineResult.final_output_path) !== path.resolve(sourcePath)) {
|
|
3842
|
+
intermediateArtifacts.add(pipelineResult.final_output_path);
|
|
3843
|
+
}
|
|
3960
3844
|
sourcePath = pipelineResult.final_output_path;
|
|
3961
3845
|
if (!isDirectLocalInput && shouldTrackExportPath(sourcePath)) {
|
|
3962
3846
|
upsertRegistry(datasetId, sourcePath, "completed");
|
|
@@ -3987,6 +3871,7 @@ async function runExportCli(args) {
|
|
|
3987
3871
|
console.error(`[Export] Resolved output directory: ${outDir}`);
|
|
3988
3872
|
console.error(`[Export] Output file: ${outputFile}`);
|
|
3989
3873
|
const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
|
|
3874
|
+
cleanupIntermediateArtifacts(intermediateArtifacts, result.output_path);
|
|
3990
3875
|
console.log(`Export complete: ${result.output_path}`);
|
|
3991
3876
|
console.log(`Format: ${result.format}${result.compression ? ` (${result.compression})` : ""}`);
|
|
3992
3877
|
if (result.rows !== undefined)
|
package/build/python/cleaner.py
CHANGED
|
@@ -182,6 +182,8 @@ def main():
|
|
|
182
182
|
output_format = "parquet"
|
|
183
183
|
|
|
184
184
|
base_name = file_path.rsplit(".", 1)[0]
|
|
185
|
+
if base_name.endswith("_cleaned"):
|
|
186
|
+
base_name = base_name[:-8]
|
|
185
187
|
if output_format == "csv":
|
|
186
188
|
output_path = f"{base_name}_cleaned.csv"
|
|
187
189
|
# Stringify ANY column that might not be CSV-safe (List, Struct, Object, etc.)
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vespermcp/mcp-server",
|
|
3
|
-
"version": "1.2.28",
|
|
3
|
+
"version": "1.2.30",
|
|
4
4
|
"description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "build/index.js",
|
package/src/python/cleaner.py
CHANGED
|
@@ -182,6 +182,8 @@ def main():
|
|
|
182
182
|
output_format = "parquet"
|
|
183
183
|
|
|
184
184
|
base_name = file_path.rsplit(".", 1)[0]
|
|
185
|
+
if base_name.endswith("_cleaned"):
|
|
186
|
+
base_name = base_name[:-8]
|
|
185
187
|
if output_format == "csv":
|
|
186
188
|
output_path = f"{base_name}_cleaned.csv"
|
|
187
189
|
# Stringify ANY column that might not be CSV-safe (List, Struct, Object, etc.)
|