@vespermcp/mcp-server 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +259 -0
- package/build/cache/cdn.js +34 -0
- package/build/cache/service.js +63 -0
- package/build/cleaning/cleaner.js +50 -0
- package/build/cleaning/evaluator.js +89 -0
- package/build/cleaning/executor.js +60 -0
- package/build/cleaning/exporter.js +87 -0
- package/build/cleaning/planner.js +111 -0
- package/build/cleaning/rules.js +57 -0
- package/build/cleaning/types.js +1 -0
- package/build/cloud/adapters/local.js +37 -0
- package/build/cloud/adapters/s3.js +24 -0
- package/build/cloud/storage-manager.js +20 -0
- package/build/cloud/types.js +1 -0
- package/build/compliance/service.js +73 -0
- package/build/compliance/store.js +80 -0
- package/build/compliance/types.js +1 -0
- package/build/data/processing-worker.js +23 -0
- package/build/data/streaming.js +38 -0
- package/build/data/worker-pool.js +39 -0
- package/build/export/exporter.js +45 -0
- package/build/export/packager.js +100 -0
- package/build/export/types.js +1 -0
- package/build/fusion/aligner.js +56 -0
- package/build/fusion/deduplicator.js +69 -0
- package/build/fusion/harmonizer.js +39 -0
- package/build/fusion/orchestrator.js +86 -0
- package/build/fusion/types.js +1 -0
- package/build/index.js +632 -0
- package/build/ingestion/hf-downloader.js +64 -0
- package/build/ingestion/ingestor.js +96 -0
- package/build/ingestion/kaggle-downloader.js +79 -0
- package/build/install/install-service.js +41 -0
- package/build/jobs/manager.js +129 -0
- package/build/jobs/queue.js +59 -0
- package/build/jobs/types.js +1 -0
- package/build/metadata/domain.js +147 -0
- package/build/metadata/github-scraper.js +47 -0
- package/build/metadata/institutional-scrapers.js +49 -0
- package/build/metadata/kaggle-scraper.js +182 -0
- package/build/metadata/license.js +68 -0
- package/build/metadata/monitoring-service.js +107 -0
- package/build/metadata/monitoring-store.js +78 -0
- package/build/metadata/monitoring-types.js +1 -0
- package/build/metadata/quality.js +48 -0
- package/build/metadata/rate-limiter.js +128 -0
- package/build/metadata/scraper.js +353 -0
- package/build/metadata/store.js +325 -0
- package/build/metadata/types.js +1 -0
- package/build/metadata/uci-scraper.js +49 -0
- package/build/monitoring/observability.js +76 -0
- package/build/quality/analyzer.js +57 -0
- package/build/quality/image-analyzer.js +46 -0
- package/build/quality/media-analyzer.js +46 -0
- package/build/quality/quality-orchestrator.js +162 -0
- package/build/quality/types.js +1 -0
- package/build/scripts/build-index.js +54 -0
- package/build/scripts/check-db.js +73 -0
- package/build/scripts/check-jobs.js +24 -0
- package/build/scripts/check-naruto.js +17 -0
- package/build/scripts/demo-full-pipeline.js +62 -0
- package/build/scripts/demo-ui.js +58 -0
- package/build/scripts/e2e-demo.js +72 -0
- package/build/scripts/massive-scrape.js +103 -0
- package/build/scripts/ops-dashboard.js +33 -0
- package/build/scripts/scrape-metadata.js +100 -0
- package/build/scripts/search-cli.js +26 -0
- package/build/scripts/test-bias.js +45 -0
- package/build/scripts/test-caching.js +51 -0
- package/build/scripts/test-cleaning.js +76 -0
- package/build/scripts/test-cloud-storage.js +48 -0
- package/build/scripts/test-compliance.js +58 -0
- package/build/scripts/test-conversion.js +64 -0
- package/build/scripts/test-custom-rules.js +58 -0
- package/build/scripts/test-db-opt.js +63 -0
- package/build/scripts/test-export-custom.js +33 -0
- package/build/scripts/test-exporter.js +53 -0
- package/build/scripts/test-fusion.js +61 -0
- package/build/scripts/test-github.js +27 -0
- package/build/scripts/test-group-split.js +52 -0
- package/build/scripts/test-hf-download.js +29 -0
- package/build/scripts/test-holdout-manager.js +61 -0
- package/build/scripts/test-hybrid-search.js +41 -0
- package/build/scripts/test-image-analysis.js +50 -0
- package/build/scripts/test-ingestion-infra.js +39 -0
- package/build/scripts/test-install.js +40 -0
- package/build/scripts/test-institutional.js +26 -0
- package/build/scripts/test-integrity.js +41 -0
- package/build/scripts/test-jit.js +42 -0
- package/build/scripts/test-job-queue.js +62 -0
- package/build/scripts/test-kaggle-download.js +34 -0
- package/build/scripts/test-large-data.js +50 -0
- package/build/scripts/test-mcp-v5.js +73 -0
- package/build/scripts/test-media-analysis.js +61 -0
- package/build/scripts/test-monitoring.js +91 -0
- package/build/scripts/test-observability.js +106 -0
- package/build/scripts/test-packager.js +55 -0
- package/build/scripts/test-pipeline.js +50 -0
- package/build/scripts/test-planning.js +64 -0
- package/build/scripts/test-privacy.js +38 -0
- package/build/scripts/test-quality.js +43 -0
- package/build/scripts/test-robust-ingestion.js +41 -0
- package/build/scripts/test-schema.js +45 -0
- package/build/scripts/test-split-validation.js +40 -0
- package/build/scripts/test-splitter.js +93 -0
- package/build/scripts/test-uci.js +27 -0
- package/build/scripts/test-unified-quality.js +86 -0
- package/build/search/embedder.js +34 -0
- package/build/search/engine.js +129 -0
- package/build/search/jit-orchestrator.js +232 -0
- package/build/search/vector-store.js +105 -0
- package/build/splitting/splitter.js +57 -0
- package/build/splitting/types.js +1 -0
- package/build/tools/formatter.js +227 -0
- package/build/utils/downloader.js +52 -0
- package/mcp-config-template.json +15 -0
- package/package.json +84 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/cleaner.py +196 -0
- package/src/python/export_engine.py +112 -0
- package/src/python/framework_adapters.py +100 -0
- package/src/python/github_adapter.py +106 -0
- package/src/python/image_engine.py +86 -0
- package/src/python/media_engine.py +133 -0
- package/src/python/nasa_adapter.py +82 -0
- package/src/python/quality_engine.py +243 -0
- package/src/python/splitter_engine.py +283 -0
- package/src/python/test_framework_adapters.py +61 -0
- package/src/python/uci_adapter.py +94 -0
- package/src/python/worldbank_adapter.py +99 -0
package/build/index.js
ADDED
|
@@ -0,0 +1,632 @@
|
|
|
1
|
+
#! /usr/bin/env node
|
|
2
|
+
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
|
|
3
|
+
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
4
|
+
import { CallToolRequestSchema, ListToolsRequestSchema, ErrorCode, McpError, } from "@modelcontextprotocol/sdk/types.js";
|
|
5
|
+
import { fileURLToPath } from "url";
|
|
6
|
+
import path from "path";
|
|
7
|
+
import fs from "fs";
|
|
8
|
+
import { MetadataStore } from "./metadata/store.js";
|
|
9
|
+
import { VectorStore } from "./search/vector-store.js";
|
|
10
|
+
import { Embedder } from "./search/embedder.js";
|
|
11
|
+
import { SearchEngine } from "./search/engine.js";
|
|
12
|
+
import { formatSearchResults, formatDatasetInfo, formatJobStatus } from "./tools/formatter.js";
|
|
13
|
+
import { JobManager } from "./jobs/manager.js";
|
|
14
|
+
import { QualityAnalyzer } from "./quality/analyzer.js";
|
|
15
|
+
import { CleaningPlanner } from "./cleaning/planner.js";
|
|
16
|
+
import { DataCleaner } from "./cleaning/cleaner.js";
|
|
17
|
+
import { PipelineExecutor } from "./cleaning/executor.js";
|
|
18
|
+
import { DataSplitter } from "./splitting/splitter.js";
|
|
19
|
+
import { DataExporter } from "./export/exporter.js";
|
|
20
|
+
import { DataIngestor } from "./ingestion/ingestor.js";
|
|
21
|
+
import { InstallService } from "./install/install-service.js";
|
|
22
|
+
import { CacheService, MockRedisProvider } from "./cache/service.js";
|
|
23
|
+
import { ImageAnalyzer } from "./quality/image-analyzer.js";
|
|
24
|
+
import { MediaAnalyzer } from "./quality/media-analyzer.js";
|
|
25
|
+
import { QualityOrchestrator } from "./quality/quality-orchestrator.js";
|
|
26
|
+
// Determine absolute paths relative to the compiled script
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// We are in /build/index.js, so project root is one level up
const projectRoot = path.join(__dirname, "..");
// Persistent stores live under <projectRoot>/data.
const dbPath = path.join(projectRoot, "data", "metadata.db");
const vectorPath = path.join(projectRoot, "data", "vectors.json");
// Module-level singletons shared by every request handler below.
// Declaration order matters: later services take earlier ones as dependencies.
const metadataStore = new MetadataStore(dbPath);
const vectorStore = new VectorStore(vectorPath);
const embedder = Embedder.getInstance();
// Hybrid search over SQLite metadata + vector embeddings.
const searchEngine = new SearchEngine(metadataStore, vectorStore, embedder);
// Tracks asynchronous jobs (used by custom_clean / prepare_dataset handlers).
const jobManager = JobManager.getInstance(metadataStore);
const dataIngestor = new DataIngestor(projectRoot, metadataStore);
const installService = new InstallService(projectRoot, metadataStore);
// MockRedisProvider: presumably an in-memory stand-in for Redis — confirm in cache/service.js.
const cacheService = new CacheService(new MockRedisProvider());
const qualityAnalyzer = new QualityAnalyzer(cacheService, projectRoot);
const cleaningPlanner = new CleaningPlanner(cacheService);
const dataCleaner = new DataCleaner(projectRoot);
const pipelineExecutor = new PipelineExecutor(projectRoot);
// NOTE(review): dataSplitter and dataExporter are constructed but never
// referenced by any handler in this file — verify whether they are dead code.
const dataSplitter = new DataSplitter(projectRoot);
const dataExporter = new DataExporter(projectRoot);
const imageAnalyzer = new ImageAnalyzer(projectRoot);
const mediaAnalyzer = new MediaAnalyzer(projectRoot);
const qualityOrchestrator = new QualityOrchestrator(projectRoot);
|
|
50
|
+
// Subscribe to job updates for real-time streaming to the UI.
// NOTE: `server` is declared further down in this module; referencing it here
// is safe only because this callback runs after module evaluation completes,
// by which time `server` has been assigned.
jobManager.on("jobUpdated", (job) => {
    const level = job.status === "failed" ? "error" : "info";
    const emoji = job.status === "completed" ? "✅" : (job.status === "failed" ? "❌" : "⏳");
    const progress = job.progress > 0 ? `[${job.progress}%]` : "";
    try {
        server.sendLoggingMessage({
            level,
            data: `${emoji} [Job ${job.id.substring(0, 8)}] ${progress} ${job.status_text}`
        });
    }
    catch (err) {
        // A failed log push (e.g., the client has not finished initializing,
        // or the transport is closed) must not propagate into the JobManager's
        // event emitter and crash the job pipeline. stderr is safe for
        // diagnostics: stdout is reserved for the MCP protocol.
        console.error("Failed to stream job update:", err);
    }
});
|
|
60
|
+
// Create the MCP server instance.
// Capabilities: tool listing/invocation plus logging (for job-progress streaming).
const serverInfo = {
    name: "vesper",
    version: "1.0.0",
};
const serverCapabilities = {
    capabilities: {
        tools: {},
        logging: {},
    },
};
const server = new Server(serverInfo, serverCapabilities);
|
|
70
|
+
// List Tools
// The tool catalogue is fully static, so it is built once at module scope
// instead of being re-allocated on every ListTools request.
const TOOL_DEFINITIONS = [
    {
        name: "vesper_search",
        description: "Search for datasets using natural language. Supports negative keywords (e.g., 'finance -crypto'). Returns formatted results with safety indicators, quality warnings, and source badges.",
        inputSchema: {
            type: "object",
            properties: {
                query: {
                    type: "string",
                    description: "The search query. Use -term to exclude keywords.",
                },
            },
            required: ["query"],
        },
    },
    {
        name: "get_dataset_info",
        description: "Get detailed metadata for a specific dataset by its ID. Returns comprehensive information including license, safety flags, and data characteristics.",
        inputSchema: {
            type: "object",
            properties: {
                dataset_id: {
                    type: "string",
                    description: "The unique dataset ID (e.g., 'user/dataset_name' for HuggingFace or 'kaggle:username/dataset' for Kaggle)",
                },
            },
            required: ["dataset_id"],
        },
    },
    {
        name: "analyze_quality",
        description: "Perform a deep quality check on a dataset. Returns a detailed report including duplicates, outliers, and schema issues.",
        inputSchema: {
            type: "object",
            properties: {
                dataset_id: {
                    type: "string",
                    description: "The dataset ID to analyze.",
                },
            },
            required: ["dataset_id"],
        },
    },
    {
        name: "preview_cleaning",
        description: "Dry-run of the cleaning plan. Shows what WOULD be cleaned without modifying data.",
        inputSchema: {
            type: "object",
            properties: {
                dataset_id: {
                    type: "string",
                    description: "The dataset ID to preview cleaning for.",
                },
            },
            required: ["dataset_id"],
        },
    },
    {
        name: "custom_clean",
        description: "Apply specific cleaning operations to a dataset as an asynchronous job.",
        inputSchema: {
            type: "object",
            properties: {
                dataset_id: {
                    type: "string",
                    description: "The dataset ID to clean.",
                },
                operations: {
                    type: "array",
                    items: {
                        type: "object",
                        properties: {
                            type: { type: "string" },
                            params: { type: "object" },
                            reason: { type: "string" },
                        },
                    },
                },
            },
            required: ["dataset_id", "operations"],
        },
    },
    {
        name: "prepare_dataset",
        description: "Full pipeline: Analyze, Clean, Split, and Export as an asynchronous job.",
        inputSchema: {
            type: "object",
            properties: {
                query: { type: "string" },
                requirements: { type: "string" },
                cleaning_options: { type: "object" },
                split_config: { type: "object" },
            },
            required: ["query"],
        },
    },
    {
        name: "compare_datasets",
        description: "Compare 2-3 datasets side-by-side.",
        inputSchema: {
            type: "object",
            properties: {
                dataset_ids: {
                    type: "array",
                    items: { type: "string" },
                },
            },
            required: ["dataset_ids"],
        },
    },
    {
        name: "check_job_status",
        description: "Check the status of an asynchronous job.",
        inputSchema: {
            type: "object",
            properties: {
                job_id: { type: "string" },
            },
            required: ["job_id"],
        },
    },
    {
        name: "export_dataset",
        description: "Export an ingested or prepared dataset to a specific local directory.",
        inputSchema: {
            type: "object",
            properties: {
                dataset_id: {
                    type: "string",
                    description: "The unique dataset ID.",
                },
                target_dir: {
                    type: "string",
                    description: "Optional custom local directory for export (e.g., './naruto-quotes').",
                },
                format: {
                    type: "string",
                    enum: ["csv", "parquet"],
                    description: "Desired output format (default: csv).",
                },
            },
            required: ["dataset_id"],
        },
    },
    {
        name: "analyze_image_quality",
        description: "Analyze image quality (resolution, blur, corruption) for a folder or single image.",
        inputSchema: {
            type: "object",
            properties: {
                path: {
                    type: "string",
                    description: "Absolute path to the image file or folder.",
                },
            },
            required: ["path"],
        },
    },
    {
        name: "analyze_media_quality",
        description: "Analyze audio/video quality (sample rate, duration, FPS, corruption) for a folder or single file.",
        inputSchema: {
            type: "object",
            properties: {
                path: {
                    type: "string",
                    description: "Absolute path to the audio/video file or folder.",
                },
            },
            required: ["path"],
        },
    },
    {
        name: "generate_quality_report",
        description: "Generate a comprehensive unified quality report for a multimodal dataset (text, image, audio, video).",
        inputSchema: {
            type: "object",
            properties: {
                dataset_id: {
                    type: "string",
                    description: "Dataset identifier.",
                },
                dataset_path: {
                    type: "string",
                    description: "Absolute path to the dataset directory.",
                },
            },
            required: ["dataset_id", "dataset_path"],
        },
    },
];
server.setRequestHandler(ListToolsRequestSchema, async () => {
    return { tools: TOOL_DEFINITIONS };
});
|
|
266
|
+
// Call Tool
/**
 * Extract a required string argument from a tool call.
 *
 * BUG FIX: the previous code used `String(request.params.arguments?.key)`,
 * which converts a missing argument into the literal string "undefined" —
 * so every subsequent `if (!value)` required-argument guard was dead code.
 *
 * @param {object|undefined} args - The tool-call arguments object.
 * @param {string} key - The argument name to extract.
 * @param {string} [message] - Error message for the InvalidParams error.
 * @returns {string} The argument value.
 * @throws {McpError} InvalidParams when the argument is missing, not a string, or empty.
 */
function requireStringArg(args, key, message = `${key} is required`) {
    const value = args?.[key];
    if (typeof value !== "string" || value === "") {
        throw new McpError(ErrorCode.InvalidParams, message);
    }
    return value;
}
server.setRequestHandler(CallToolRequestSchema, async (request) => {
    const args = request.params.arguments;
    switch (request.params.name) {
        case "vesper_search": {
            const query = requireStringArg(args, "query", "Query is required");
            const limit = 5;
            const safeOnly = true; // Enable safe filter by default
            const results = await searchEngine.search(query, { limit, safeOnly });
            const formattedOutput = formatSearchResults(results);
            return {
                content: [
                    {
                        type: "text",
                        text: formattedOutput,
                    },
                ],
            };
        }
        case "get_dataset_info": {
            const datasetId = requireStringArg(args, "dataset_id");
            const dataset = metadataStore.getDataset(datasetId);
            if (!dataset) {
                return {
                    content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}` }],
                    isError: true,
                };
            }
            const formattedOutput = formatDatasetInfo(dataset);
            return { content: [{ type: "text", text: formattedOutput }] };
        }
        case "analyze_quality": {
            const datasetId = requireStringArg(args, "dataset_id");
            let filePath = path.join(projectRoot, "data", "raw", `${datasetId.replace(/\//g, "_")}.csv`);
            // Demo Fallback for easy testing
            if (datasetId === "demo" || !fs.existsSync(filePath)) {
                const demoPath = path.join(projectRoot, "e2e_demo_output", "raw_data.csv");
                if (fs.existsSync(demoPath)) {
                    filePath = demoPath;
                }
                else if (datasetId !== "demo") {
                    return {
                        content: [{ type: "text", text: `ERROR: Local data file for ${datasetId} not found. Try running prepare_dataset first, or use 'demo' as the dataset_id.` }],
                        isError: true
                    };
                }
                // NOTE(review): when datasetId === "demo" but the demo file is
                // missing, this falls through and analyzes a nonexistent path —
                // preserved from the original; confirm intended behavior.
            }
            const report = await qualityAnalyzer.analyze(filePath);
            return {
                content: [{ type: "text", text: JSON.stringify(report, null, 2) }]
            };
        }
        case "preview_cleaning": {
            const datasetId = requireStringArg(args, "dataset_id");
            let filePath = path.join(projectRoot, "data", "raw", `${datasetId.replace(/\//g, "_")}.csv`);
            if (datasetId === "demo" || !fs.existsSync(filePath)) {
                const demoPath = path.join(projectRoot, "e2e_demo_output", "raw_data.csv");
                if (fs.existsSync(demoPath)) {
                    filePath = demoPath;
                }
                else {
                    throw new McpError(ErrorCode.InvalidParams, `Local data file not found for ${datasetId}. Please run prepare_dataset first.`);
                }
            }
            // Dry-run: analyze quality, derive a plan, but never touch the data.
            const report = await qualityAnalyzer.analyze(filePath);
            const plan = await cleaningPlanner.generatePlan(datasetId, report);
            let explanation = `### 📋 Cleaning Plan for ${datasetId}\n\n`;
            explanation += `Estimated Quality Improvement: +${plan.estimated_impact.quality_score_improvement} points\n\n`;
            if (plan.operations.length === 0) {
                explanation += "✅ No cleaning operations required.";
            }
            else {
                plan.operations.forEach((op, i) => {
                    explanation += `${i + 1}. **${op.type}**: ${op.reason}\n`;
                });
            }
            return {
                content: [{ type: "text", text: explanation }]
            };
        }
        case "custom_clean": {
            const datasetId = requireStringArg(args, "dataset_id");
            const ops = args?.operations;
            // Validate up front: the schema marks operations as required, but the
            // original code would forward undefined straight to the cleaner.
            if (!Array.isArray(ops) || ops.length === 0) {
                throw new McpError(ErrorCode.InvalidParams, "operations must be a non-empty array");
            }
            let filePath = path.join(projectRoot, "data", "raw", `${datasetId.replace(/\//g, "_")}.csv`);
            if (datasetId === "demo" || !fs.existsSync(filePath)) {
                const demoPath = path.join(projectRoot, "e2e_demo_output", "raw_data.csv");
                if (fs.existsSync(demoPath)) {
                    filePath = demoPath;
                }
                else {
                    throw new McpError(ErrorCode.InvalidParams, `Local data file not found for ${datasetId}`);
                }
            }
            const job = jobManager.createJob("clean", 0, { datasetId, ops });
            // Run in background
            jobManager.runJob(job.id, async (update) => {
                update({ status_text: "Cleaning dataset..." });
                const result = await dataCleaner.clean(filePath, ops);
                if (!result.success)
                    throw new Error(result.error);
                return result.output_path;
            });
            return {
                content: [{ type: "text", text: `Job started successfully. ID: ${job.id}. Use check_job_status to monitor progress.` }]
            };
        }
        case "prepare_dataset": {
            const query = requireStringArg(args, "query", "Query is required");
            const job = jobManager.createJob("prepare", 0, { query });
            // Orchestrated Background Task
            jobManager.runJob(job.id, async (update) => {
                update({ progress: 10, status_text: "Searching for best dataset matching query..." });
                const results = await searchEngine.search(query, { limit: 1 });
                if (results.length === 0)
                    throw new Error("No datasets found matching the query.");
                const topDataset = results[0];
                // Phase 6: Real Ingestion
                update({
                    progress: 20,
                    status_text: `Matched: ${topDataset.name} (${topDataset.source})`
                });
                const source = topDataset.source;
                const filePath = await dataIngestor.ensureData(topDataset.id, source, (msg, prog) => {
                    update({
                        status_text: msg,
                        progress: prog !== undefined ? 20 + Math.floor(prog * 0.3) : undefined // 20% -> 50%
                    });
                });
                update({ progress: 55, status_text: "Analyzing dataset quality..." });
                const quality = await qualityAnalyzer.analyze(filePath);
                const pipelineResult = await pipelineExecutor.runPipeline(topDataset.id, filePath, "csv", (msg) => {
                    update({ status_text: msg });
                });
                update({ progress: 90, status_text: "Installing dataset into codebase..." });
                const installPath = await installService.install(topDataset.id, pipelineResult.final_output_path);
                update({ progress: 100, status_text: "Preparation complete!" });
                const message = `✅ Preparation complete for ${topDataset.name}.\n` +
                    `📦 Dataset installed to: ${installPath}\n` +
                    `🚀 You can now use this dataset for training your models.`;
                return message;
            });
            return {
                content: [{ type: "text", text: `Autonomous preparation job started. ID: ${job.id}. Use check_job_status to monitor progress.` }]
            };
        }
        case "compare_datasets": {
            const datasetIds = args?.dataset_ids;
            // BUG FIX: the original mapped over dataset_ids without checking it
            // was an array, and dereferenced d.license.category unguarded.
            if (!Array.isArray(datasetIds) || datasetIds.length === 0) {
                throw new McpError(ErrorCode.InvalidParams, "dataset_ids must be a non-empty array");
            }
            const datasets = datasetIds.map(id => metadataStore.getDataset(id)).filter(d => !!d);
            if (datasets.length === 0) {
                return {
                    content: [{ type: "text", text: "ERROR: None of the requested datasets were found." }],
                    isError: true,
                };
            }
            let comparison = "| Metric | " + datasets.map(d => d.name).join(" | ") + " |\n";
            comparison += "| :--- | " + datasets.map(() => " :---: ").join("|") + " |\n";
            comparison += "| Quality Score | " + datasets.map(d => `${d.quality_score}/100`).join(" | ") + " |\n";
            comparison += "| License | " + datasets.map(d => d.license?.category ?? "unknown").join(" | ") + " |\n";
            comparison += "| Downloads | " + datasets.map(d => d.downloads).join(" | ") + " |\n";
            comparison += "| Domain | " + datasets.map(d => d.domain).join(" | ") + " |\n";
            return {
                content: [{ type: "text", text: comparison }]
            };
        }
        case "check_job_status": {
            const jobId = requireStringArg(args, "job_id");
            const job = metadataStore.getJob(jobId);
            if (!job) {
                throw new McpError(ErrorCode.InvalidParams, `Job not found: ${jobId}`);
            }
            return {
                content: [{ type: "text", text: formatJobStatus(job) }]
            };
        }
        case "export_dataset": {
            const datasetId = requireStringArg(args, "dataset_id");
            const targetDir = args?.target_dir ? String(args.target_dir) : undefined;
            const requestedFormat = args?.format || "csv";
            const dataset = metadataStore.getDataset(datasetId);
            if (!dataset) {
                throw new McpError(ErrorCode.InvalidParams, `Dataset not found: ${datasetId}`);
            }
            // Use Metadata to find the actual local file
            const downloadStatus = metadataStore.getDownloadStatus(datasetId);
            if (!downloadStatus || !fs.existsSync(downloadStatus.local_path)) {
                return {
                    content: [{ type: "text", text: `ERROR: No local data found for ${datasetId}. Please run prepare_dataset first.` }],
                    isError: true
                };
            }
            let sourcePath = downloadStatus.local_path;
            // Check if we need conversion
            const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
            if (currentExt !== requestedFormat) {
                // Diagnostics go to stderr-style console.log? No — this mirrors
                // the original; stdout carries the MCP protocol, so confirm this
                // console.log is intentional.
                console.log(`[Export] Format mismatch (${currentExt} vs ${requestedFormat}). Converting...`);
                try {
                    const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, requestedFormat);
                    sourcePath = pipelineResult.final_output_path;
                }
                catch (err) {
                    return {
                        content: [{ type: "text", text: `ERROR: Failed to convert dataset to ${requestedFormat}: ${err.message}` }],
                        isError: true
                    };
                }
            }
            try {
                const finalPath = await installService.install(datasetId, sourcePath, targetDir);
                return {
                    content: [{ type: "text", text: `✅ Dataset ${datasetId} exported to: ${finalPath}` }]
                };
            }
            catch (error) {
                return {
                    content: [{ type: "text", text: `ERROR: Export failed: ${error.message}` }],
                    isError: true
                };
            }
        }
        case "analyze_image_quality": {
            const inputPath = requireStringArg(args, "path");
            if (!fs.existsSync(inputPath)) {
                throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
            }
            try {
                const report = await imageAnalyzer.analyze(inputPath);
                let output = `## 📷 Image Quality Report\n\n`;
                output += `- **Total Images**: ${report.total_images}\n`;
                output += `- **Corrupted**: ${report.corrupted_count}\n`;
                output += `- **Average Resolution**: ${Math.round(report.average_width)}x${Math.round(report.average_height)}\n`;
                output += `- **Blurry Images**: ${report.blurry_count}\n\n`;
                if (report.individual_results.length > 0) {
                    output += `### 🔬 Sample Detail (Top 5)\n`;
                    report.individual_results.slice(0, 5).forEach(img => {
                        const statusEmoji = img.status === "ok" ? (img.is_blurry ? "⚠️" : "✅") : "❌";
                        output += `${statusEmoji} **${img.filename}**: ${img.width}x${img.height} | Mode: ${img.mode} | Blur: ${img.blur_score?.toFixed(1)}\n`;
                    });
                }
                return {
                    content: [{ type: "text", text: output }]
                };
            }
            catch (error) {
                return {
                    content: [{ type: "text", text: `ERROR: Image analysis failed: ${error.message}` }],
                    isError: true
                };
            }
        }
        case "analyze_media_quality": {
            const inputPath = requireStringArg(args, "path");
            if (!fs.existsSync(inputPath)) {
                throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
            }
            try {
                const report = await mediaAnalyzer.analyze(inputPath);
                let output = `## 🎬 Media Quality Report\n\n`;
                output += `- **Total Files**: ${report.total_files}\n`;
                output += `- **OK Files**: ${report.ok_files}\n`;
                output += `- **Failed Files**: ${report.failed_files}\n`;
                if ('avg_audio_duration' in report && report.avg_audio_duration) {
                    output += `- **Average Audio Duration**: ${report.avg_audio_duration.toFixed(2)}s\n`;
                }
                if ('avg_video_duration' in report && report.avg_video_duration) {
                    output += `- **Average Video Duration**: ${report.avg_video_duration.toFixed(2)}s\n`;
                    output += `- **Average FPS**: ${report.avg_fps?.toFixed(1)}\n`;
                }
                output += `\n### 📊 Sample Detail (Top 5)\n`;
                report.details.slice(0, 5).forEach(item => {
                    const statusEmoji = item.status === "ok" ? "✅" : "❌";
                    if (item.type === "audio" && 'sample_rate' in item) {
                        output += `${statusEmoji} **${item.filename}**: ${item.duration?.toFixed(1)}s | SR: ${item.sample_rate}Hz | Silent: ${item.is_silent}\n`;
                    }
                    else if (item.type === "video" && 'width' in item) {
                        output += `${statusEmoji} **${item.filename}**: ${item.width}x${item.height} | ${item.fps?.toFixed(1)}fps | Risk: ${item.corruption_risk}\n`;
                    }
                    else {
                        output += `${statusEmoji} **${item.filename}**: ${item.error}\n`;
                    }
                });
                return {
                    content: [{ type: "text", text: output }]
                };
            }
            catch (error) {
                return {
                    content: [{ type: "text", text: `ERROR: Media analysis failed: ${error.message}` }],
                    isError: true
                };
            }
        }
        case "generate_quality_report": {
            const datasetId = requireStringArg(args, "dataset_id");
            const datasetPath = requireStringArg(args, "dataset_path");
            if (!fs.existsSync(datasetPath)) {
                throw new McpError(ErrorCode.InvalidParams, `Dataset path not found: ${datasetPath}`);
            }
            try {
                // Optionally load text quality from metadata if available.
                // getDataset is used synchronously elsewhere in this file, so the
                // stray `await` from the original has been dropped for consistency.
                const metadata = metadataStore.getDataset(datasetId);
                // TODO: Integrate text quality analysis when available
                const textQuality = null;
                const report = await qualityOrchestrator.generateReport(datasetId, datasetPath, textQuality);
                // Save report to metadata
                if (metadata) {
                    metadata.unified_quality_report = report;
                    await metadataStore.saveDataset(metadata);
                }
                let output = `# 📊 Unified Quality Report\n\n`;
                output += `**Dataset**: ${datasetId}\n`;
                output += `**Modalities**: ${report.modalities.join(", ")}\n`;
                output += `**Overall Quality Score**: ${report.overall_quality_score}/100\n\n`;
                if (report.text_quality) {
                    output += `## 📝 Text Quality\n`;
                    output += `- Rows: ${report.text_quality.row_count}\n`;
                    output += `- Columns: ${report.text_quality.column_count}\n`;
                    output += `- Missing: ${report.text_quality.missing_percentage.toFixed(1)}%\n`;
                    output += `- Duplicates: ${report.text_quality.duplicate_percentage.toFixed(1)}%\n\n`;
                }
                if (report.image_quality) {
                    output += `## 🖼️ Image Quality\n`;
                    output += `- Total Images: ${report.image_quality.total_images}\n`;
                    output += `- Corrupted: ${report.image_quality.corrupted_count}\n`;
                    output += `- Avg Resolution: ${report.image_quality.avg_resolution}\n`;
                    output += `- Blurry: ${report.image_quality.blurry_percentage.toFixed(1)}%\n\n`;
                }
                if (report.audio_quality) {
                    output += `## 🎵 Audio Quality\n`;
                    output += `- Total Files: ${report.audio_quality.total_files}\n`;
                    output += `- Avg Duration: ${report.audio_quality.avg_duration.toFixed(1)}s\n`;
                    output += `- Avg Sample Rate: ${report.audio_quality.avg_sample_rate.toFixed(0)}Hz\n`;
                    output += `- Silent: ${report.audio_quality.silent_percentage.toFixed(1)}%\n\n`;
                }
                if (report.video_quality) {
                    output += `## 🎬 Video Quality\n`;
                    output += `- Total Files: ${report.video_quality.total_files}\n`;
                    output += `- Avg Duration: ${report.video_quality.avg_duration.toFixed(1)}s\n`;
                    output += `- Avg FPS: ${report.video_quality.avg_fps.toFixed(1)}\n`;
                    output += `- High Corruption Risk: ${report.video_quality.corruption_risk_high}\n\n`;
                }
                output += `## 💡 Recommendations\n`;
                report.recommendations.forEach(rec => {
                    output += `- ${rec}\n`;
                });
                return {
                    content: [{ type: "text", text: output }]
                };
            }
            catch (error) {
                return {
                    content: [{ type: "text", text: `ERROR: Quality report generation failed: ${error.message}` }],
                    isError: true
                };
            }
        }
        default:
            throw new McpError(ErrorCode.MethodNotFound, "Tool not found");
    }
});
|
|
624
|
+
/**
 * Entry point: attach the server to a stdio transport and start serving.
 * Diagnostics go to stderr — stdout is reserved for the MCP protocol.
 */
async function main() {
    await server.connect(new StdioServerTransport());
    console.error("Vesper MCP server running on stdio");
}

main().catch((error) => {
    // Fatal startup/transport failure: report and exit non-zero.
    console.error("Server error:", error);
    process.exit(1);
});
|