@vespermcp/mcp-server 1.0.5 → 1.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -4
- package/build/cleaning/cleaner.js +27 -2
- package/build/cleaning/executor.js +7 -6
- package/build/cleaning/planner.js +16 -4
- package/build/config/config-manager.js +215 -0
- package/build/export/exporter.js +26 -2
- package/build/index.js +273 -92
- package/build/ingestion/ingestor.js +5 -22
- package/build/install/install-service.js +1 -1
- package/build/jobs/manager.js +17 -10
- package/build/metadata/monitoring-service.js +2 -2
- package/build/metadata/scraper.js +8 -8
- package/build/metadata/store.js +17 -2
- package/build/monitoring/observability.js +2 -2
- package/build/preparation/target-detector.js +75 -0
- package/build/python/cleaner.py +226 -0
- package/build/python/export_engine.py +131 -0
- package/build/python/framework_adapters.py +100 -0
- package/build/python/github_adapter.py +106 -0
- package/build/python/image_engine.py +86 -0
- package/build/python/media_engine.py +133 -0
- package/build/python/nasa_adapter.py +82 -0
- package/build/python/quality_engine.py +243 -0
- package/build/python/splitter_engine.py +283 -0
- package/build/python/target_engine.py +154 -0
- package/build/python/test_framework_adapters.py +61 -0
- package/build/python/uci_adapter.py +94 -0
- package/build/python/worldbank_adapter.py +99 -0
- package/build/quality/analyzer.js +40 -4
- package/build/quality/image-analyzer.js +28 -2
- package/build/quality/media-analyzer.js +28 -2
- package/build/scripts/cleanup-kaggle.js +41 -0
- package/build/scripts/repro-bug.js +37 -0
- package/build/scripts/repro-export-bug.js +56 -0
- package/build/scripts/test-mcp-v5.js +12 -11
- package/build/scripts/test-production-sync.js +36 -0
- package/build/scripts/test-target-detector.js +29 -0
- package/build/scripts/test-write.js +14 -0
- package/build/scripts/verify-integration.js +57 -0
- package/build/scripts/verify-priority.js +33 -0
- package/build/search/engine.js +13 -2
- package/build/search/jit-orchestrator.js +6 -40
- package/build/search/vector-store.js +18 -0
- package/build/splitting/splitter.js +27 -2
- package/build/tools/formatter.js +15 -6
- package/build/utils/downloader.js +2 -2
- package/build/utils/selector.js +69 -0
- package/package.json +8 -4
- package/src/python/cleaner.py +33 -3
- package/src/python/export_engine.py +19 -0
- package/src/python/target_engine.py +154 -0
|
@@ -1,12 +1,44 @@
|
|
|
1
1
|
import { spawn } from "child_process";
|
|
2
2
|
import path from "path";
|
|
3
|
+
import fs from "fs";
|
|
3
4
|
export class QualityAnalyzer {
|
|
4
5
|
cache;
|
|
5
6
|
pythonPath = "python"; // Assumes python is in PATH
|
|
6
7
|
scriptPath;
|
|
7
|
-
constructor(cache,
|
|
8
|
+
constructor(cache, buildDir = process.cwd()) {
|
|
9
|
+
// buildDir is the directory containing the compiled JS (e.g., build/)
|
|
10
|
+
// Priority:
|
|
11
|
+
// 1. ~/.vesper/python (stable synced location)
|
|
12
|
+
// 2. build/python (production)
|
|
13
|
+
// 3. src/python (development)
|
|
8
14
|
this.cache = cache;
|
|
9
|
-
|
|
15
|
+
const homeDir = process.env.HOME || process.env.USERPROFILE || buildDir;
|
|
16
|
+
const dataRoot = path.join(homeDir, ".vesper");
|
|
17
|
+
const scriptPath0 = path.resolve(dataRoot, "python", "quality_engine.py");
|
|
18
|
+
const scriptPath1 = path.resolve(buildDir, "python", "quality_engine.py");
|
|
19
|
+
const scriptPath2 = path.resolve(buildDir, "..", "src", "python", "quality_engine.py");
|
|
20
|
+
const scriptPath3 = path.resolve(buildDir, "..", "python", "quality_engine.py");
|
|
21
|
+
if (fs.existsSync(scriptPath0)) {
|
|
22
|
+
this.scriptPath = scriptPath0;
|
|
23
|
+
}
|
|
24
|
+
else if (fs.existsSync(scriptPath1)) {
|
|
25
|
+
this.scriptPath = scriptPath1;
|
|
26
|
+
}
|
|
27
|
+
else if (fs.existsSync(scriptPath2)) {
|
|
28
|
+
this.scriptPath = scriptPath2;
|
|
29
|
+
}
|
|
30
|
+
else if (fs.existsSync(scriptPath3)) {
|
|
31
|
+
this.scriptPath = scriptPath3;
|
|
32
|
+
}
|
|
33
|
+
else {
|
|
34
|
+
// Fallback to stable data path, error will be caught during execution
|
|
35
|
+
this.scriptPath = scriptPath0;
|
|
36
|
+
console.error(`[QualityAnalyzer] WARNING: Python script not found!`);
|
|
37
|
+
}
|
|
38
|
+
// Detect Python command (Windows may use 'py' instead of 'python')
|
|
39
|
+
if (process.platform === "win32") {
|
|
40
|
+
this.pythonPath = "py";
|
|
41
|
+
}
|
|
10
42
|
}
|
|
11
43
|
/**
|
|
12
44
|
* Run quality analysis on a local file (CSV/Parquet/JSON)
|
|
@@ -16,7 +48,7 @@ export class QualityAnalyzer {
|
|
|
16
48
|
if (this.cache && datasetId) {
|
|
17
49
|
const cached = await this.cache.getReport(datasetId);
|
|
18
50
|
if (cached) {
|
|
19
|
-
console.
|
|
51
|
+
console.error(`[QualityAnalyzer] Cache hit for ${datasetId}`);
|
|
20
52
|
return cached;
|
|
21
53
|
}
|
|
22
54
|
}
|
|
@@ -32,7 +64,11 @@ export class QualityAnalyzer {
|
|
|
32
64
|
});
|
|
33
65
|
process.on("close", (code) => {
|
|
34
66
|
if (code !== 0) {
|
|
35
|
-
|
|
67
|
+
const errorDetails = `Quality Analyzer failed (code ${code})
|
|
68
|
+
Command: ${this.pythonPath} ${this.scriptPath} ${filePath}
|
|
69
|
+
Script path exists: ${fs.existsSync(this.scriptPath)}
|
|
70
|
+
Error output: ${stderr}`;
|
|
71
|
+
reject(new Error(errorDetails));
|
|
36
72
|
return;
|
|
37
73
|
}
|
|
38
74
|
try {
|
|
@@ -1,10 +1,36 @@
|
|
|
1
1
|
import { spawn } from "child_process";
|
|
2
2
|
import path from "path";
|
|
3
|
+
import fs from "fs";
|
|
3
4
|
export class ImageAnalyzer {
|
|
4
5
|
pythonPath = "python";
|
|
5
6
|
scriptPath;
|
|
6
|
-
constructor(
|
|
7
|
-
|
|
7
|
+
constructor(buildDir = process.cwd()) {
|
|
8
|
+
// buildDir is the directory containing the compiled JS (e.g., build/)
|
|
9
|
+
const homeDir = process.env.HOME || process.env.USERPROFILE || buildDir;
|
|
10
|
+
const dataRoot = path.join(homeDir, ".vesper");
|
|
11
|
+
const scriptPath0 = path.resolve(dataRoot, "python", "image_engine.py");
|
|
12
|
+
const scriptPath1 = path.resolve(buildDir, "python", "image_engine.py");
|
|
13
|
+
const scriptPath2 = path.resolve(buildDir, "..", "src", "python", "image_engine.py");
|
|
14
|
+
const scriptPath3 = path.resolve(buildDir, "..", "python", "image_engine.py");
|
|
15
|
+
if (fs.existsSync(scriptPath0)) {
|
|
16
|
+
this.scriptPath = scriptPath0;
|
|
17
|
+
}
|
|
18
|
+
else if (fs.existsSync(scriptPath1)) {
|
|
19
|
+
this.scriptPath = scriptPath1;
|
|
20
|
+
}
|
|
21
|
+
else if (fs.existsSync(scriptPath2)) {
|
|
22
|
+
this.scriptPath = scriptPath2;
|
|
23
|
+
}
|
|
24
|
+
else if (fs.existsSync(scriptPath3)) {
|
|
25
|
+
this.scriptPath = scriptPath3;
|
|
26
|
+
}
|
|
27
|
+
else {
|
|
28
|
+
this.scriptPath = scriptPath0;
|
|
29
|
+
}
|
|
30
|
+
// Detect Python command (Windows may use 'py' instead of 'python')
|
|
31
|
+
if (process.platform === "win32") {
|
|
32
|
+
this.pythonPath = "py";
|
|
33
|
+
}
|
|
8
34
|
}
|
|
9
35
|
/**
|
|
10
36
|
* Analyze image quality for a single file or a directory
|
|
@@ -1,10 +1,36 @@
|
|
|
1
1
|
import { spawn } from "child_process";
|
|
2
2
|
import path from "path";
|
|
3
|
+
import fs from "fs";
|
|
3
4
|
export class MediaAnalyzer {
|
|
4
5
|
pythonPath = "python";
|
|
5
6
|
scriptPath;
|
|
6
|
-
constructor(
|
|
7
|
-
|
|
7
|
+
constructor(buildDir = process.cwd()) {
|
|
8
|
+
// buildDir is the directory containing the compiled JS (e.g., build/)
|
|
9
|
+
const homeDir = process.env.HOME || process.env.USERPROFILE || buildDir;
|
|
10
|
+
const dataRoot = path.join(homeDir, ".vesper");
|
|
11
|
+
const scriptPath0 = path.resolve(dataRoot, "python", "media_engine.py");
|
|
12
|
+
const scriptPath1 = path.resolve(buildDir, "python", "media_engine.py");
|
|
13
|
+
const scriptPath2 = path.resolve(buildDir, "..", "src", "python", "media_engine.py");
|
|
14
|
+
const scriptPath3 = path.resolve(buildDir, "..", "python", "media_engine.py");
|
|
15
|
+
if (fs.existsSync(scriptPath0)) {
|
|
16
|
+
this.scriptPath = scriptPath0;
|
|
17
|
+
}
|
|
18
|
+
else if (fs.existsSync(scriptPath1)) {
|
|
19
|
+
this.scriptPath = scriptPath1;
|
|
20
|
+
}
|
|
21
|
+
else if (fs.existsSync(scriptPath2)) {
|
|
22
|
+
this.scriptPath = scriptPath2;
|
|
23
|
+
}
|
|
24
|
+
else if (fs.existsSync(scriptPath3)) {
|
|
25
|
+
this.scriptPath = scriptPath3;
|
|
26
|
+
}
|
|
27
|
+
else {
|
|
28
|
+
this.scriptPath = scriptPath0;
|
|
29
|
+
}
|
|
30
|
+
// Detect Python command (Windows may use 'py' instead of 'python')
|
|
31
|
+
if (process.platform === "win32") {
|
|
32
|
+
this.pythonPath = "py";
|
|
33
|
+
}
|
|
8
34
|
}
|
|
9
35
|
/**
|
|
10
36
|
* Analyze audio/video quality for a single file or a directory
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Cleanup script to remove all Kaggle datasets from Vesper
|
|
4
|
+
*/
|
|
5
|
+
import { fileURLToPath } from "url";
|
|
6
|
+
import path from "path";
|
|
7
|
+
import { MetadataStore } from "../metadata/store.js";
|
|
8
|
+
import { VectorStore } from "../search/vector-store.js";
|
|
9
|
+
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Data lives under <home>/.vesper; fall back to the parent of this script's
// directory when neither HOME nor USERPROFILE is set.
const homeDir = process.env.HOME || process.env.USERPROFILE || path.join(__dirname, "..");
const dataRoot = path.join(homeDir, ".vesper");
const dbPath = path.join(dataRoot, "data", "metadata.db");
const vectorPath = path.join(dataRoot, "data", "vectors.json");
console.log("🧹 Vesper Kaggle Cleanup");
console.log("========================\n");
// Declared outside the try block so the finally clause can close the store on
// every path, including the "nothing to remove" path and failures.
let metadataStore;
try {
    metadataStore = new MetadataStore(dbPath);
    const vectorStore = new VectorStore(vectorPath);
    // Get all Kaggle dataset IDs
    const kaggleIds = metadataStore.getDatasetIdsBySource("kaggle");
    console.log(`Found ${kaggleIds.length} Kaggle datasets in database`);
    if (kaggleIds.length === 0) {
        console.log("✅ No Kaggle datasets to remove");
    }
    else {
        // Delete from vector store
        const vectorsDeleted = vectorStore.deleteMany(kaggleIds);
        console.log(`🗑️ Deleted ${vectorsDeleted} vectors from vector store`);
        vectorStore.save();
        // Delete from metadata database
        const datasetsDeleted = metadataStore.deleteBySource("kaggle");
        console.log(`🗑️ Deleted ${datasetsDeleted} datasets from metadata database`);
        console.log("\n✅ Cleanup complete! Kaggle datasets have been removed.");
        console.log(" You can now search without seeing Kaggle results.");
    }
}
catch (error) {
    console.error("❌ Cleanup failed:", error.message);
    // Set the exit code instead of calling process.exit() so the finally
    // clause below still runs.
    process.exitCode = 1;
}
finally {
    try {
        metadataStore?.close();
    }
    catch (closeError) {
        console.error("❌ Cleanup failed:", closeError.message);
        process.exitCode = 1;
    }
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import { JobManager } from "../jobs/manager.js";
|
|
2
|
+
import { MetadataStore } from "../metadata/store.js";
|
|
3
|
+
import fs from "fs";
|
|
4
|
+
/**
 * Reproduction script: registers a "processJob" listener on the JobManager,
 * creates one "prepare" job, waits two seconds, then prints the job's stored
 * status.
 *
 * Uses a throwaway database file ("repro_test.db") in the current working
 * directory. The store is closed and the file removed even when a step throws.
 */
async function repro() {
    const dbPath = "repro_test.db";
    // Start from a clean slate if a previous run left the file behind.
    if (fs.existsSync(dbPath))
        fs.unlinkSync(dbPath);
    const store = new MetadataStore(dbPath);
    try {
        const jobManager = JobManager.getInstance(store);
        console.log("Setting up listener...");
        jobManager.on("processJob", async (job, execute) => {
            console.log(`Listener received job ${job.id}`);
            const task = async () => {
                console.log("Running task...");
                return "success";
            };
            try {
                await execute(task);
                console.log("Execute finished");
            }
            catch (e) {
                console.error("Execute failed in listener:", e.message);
            }
        });
        console.log("Creating job...");
        const job = jobManager.createJob("prepare", 0, { query: "test" });
        console.log(`Job created: ${job.id}`);
        // Wait for a bit
        await new Promise(r => setTimeout(r, 2000));
        const finalJob = store.getJob(job.id);
        console.log("Final job status:", finalJob?.status);
        console.log("Final job status text:", finalJob?.status_text);
    }
    finally {
        // Always release the store and remove the temporary database.
        store.close();
        if (fs.existsSync(dbPath))
            fs.unlinkSync(dbPath);
    }
}
|
|
37
|
+
repro().catch(console.error);
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import path from "path";
|
|
2
|
+
import fs from "fs";
|
|
3
|
+
import { spawnSync } from "child_process";
|
|
4
|
+
const pythonPath = "python";
const scriptPath = path.join(process.cwd(), "src", "python", "cleaner.py");
const testDir = path.join(process.cwd(), "test_repro");
// recursive: true makes this a no-op when the directory already exists.
fs.mkdirSync(testDir, { recursive: true });
/**
 * Reproduce the CSV export bug: build a Parquet file with nested columns
 * (list and struct) through a generated Python script, then run cleaner.py on
 * it with an empty operation list and "csv" as the output format, and print
 * the outcome.
 */
async function runRepro() {
    console.log("=== Reproducing CSV Export Bug ===\n");
    const parquetFile = path.join(testDir, "test_nested.parquet");
    const csvOutput = path.join(testDir, "test_nested_cleaned.csv");
    // 1. Create a Parquet file with nested data (Lists/Structs) using Python
    console.log("Creating nested Parquet file...");
    const createScript = `
import polars as pl
df = pl.DataFrame({
    "id": [1, 2, 3],
    "tags": [["a", "b"], ["c"], []],
    "meta": [{"score": 0.9, "safe": True}, {"score": 0.4, "safe": False}, {"score": 0.1, "safe": True}]
})
df.write_parquet(r"${parquetFile}")
`;
    const createScriptPath = path.join(testDir, "create_data.py");
    fs.writeFileSync(createScriptPath, createScript);
    const created = spawnSync(pythonPath, [createScriptPath], { stdio: 'inherit' });
    // Without the fixture the cleaner run below would only report a missing
    // input file, which hides the bug being reproduced.
    if (created.error || created.status !== 0) {
        console.error("Failed to create nested Parquet file:", created.error?.message ?? `exit code ${created.status}`);
        return;
    }
    // 2. Call cleaner.py to convert to CSV
    console.log("Calling cleaner.py to convert to CSV...");
    const result = spawnSync(pythonPath, [
        scriptPath,
        parquetFile,
        "[]",
        "csv"
    ]);
    console.log("Exit Code:", result.status);
    console.log("Stdout:", result.stdout?.toString());
    console.log("Stderr:", result.stderr?.toString());
    if (result.status === 0) {
        try {
            const data = JSON.parse(result.stdout.toString());
            if (data.success) {
                console.log("SUCCESS! Output file:", data.output_path);
                if (fs.existsSync(data.output_path)) {
                    console.log("File exists on disk.");
                }
            }
            else {
                console.error("cleaner.py reported failure:", data.error);
            }
        }
        catch (e) {
            console.error("Failed to parse JSON output:", e);
        }
    }
}
// Run once. The original invoked runRepro() twice, so the whole reproduction
// (and both Python subprocesses) ran two times.
runRepro().catch(console.error);
|
|
@@ -47,16 +47,18 @@ async function testPhase5Tools() {
|
|
|
47
47
|
// Create job (Logic from index.ts)
|
|
48
48
|
const job = jobManager.createJob("prepare", 0, { query });
|
|
49
49
|
console.log(` - Job Created: ${job.id}`);
|
|
50
|
-
//
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
await
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
50
|
+
// Register listener for simulated work (Logic from index.ts)
|
|
51
|
+
jobManager.on("processJob", async (currJob, execute) => {
|
|
52
|
+
if (currJob.id !== job.id)
|
|
53
|
+
return;
|
|
54
|
+
await execute(async () => {
|
|
55
|
+
console.log(" - [Worker] Starting autonomous preparation task...");
|
|
56
|
+
await new Promise(r => setTimeout(r, 1000));
|
|
57
|
+
console.log(" - [Worker] Phase 1: Search complete");
|
|
58
|
+
await new Promise(r => setTimeout(r, 1000));
|
|
59
|
+
console.log(" - [Worker] Phase 2: Quality analysis complete");
|
|
60
|
+
return "data/exports/prepared_dataset.parquet";
|
|
61
|
+
});
|
|
60
62
|
});
|
|
61
63
|
// 4. Test check_job_status (Polling)
|
|
62
64
|
console.log("\nStep 4: Polling Job Status (Simulating UI Check)...");
|
|
@@ -67,7 +69,6 @@ async function testPhase5Tools() {
|
|
|
67
69
|
break;
|
|
68
70
|
await new Promise(r => setTimeout(r, 800));
|
|
69
71
|
}
|
|
70
|
-
await jobPromise;
|
|
71
72
|
console.log("\n Phase 5 tools logic verified.");
|
|
72
73
|
}
|
|
73
74
|
testPhase5Tools().catch(console.error);
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import { JobManager } from "../jobs/manager.js";
|
|
2
|
+
import { MetadataStore } from "../metadata/store.js";
|
|
3
|
+
import fs from "fs";
|
|
4
|
+
/**
 * Attach a "processJob" listener to the JobManager, create one "prepare" job,
 * wait one second, and print the status stored for that job.
 *
 * Uses a throwaway database file ("prod_sync.db") in the current working
 * directory. The store is closed and the file removed even when a step throws.
 */
async function testSync() {
    console.log("Starting Production Sync Test...");
    const dbPath = "prod_sync.db";
    // Start from a clean slate if a previous run left the file behind.
    if (fs.existsSync(dbPath))
        fs.unlinkSync(dbPath);
    const store = new MetadataStore(dbPath);
    try {
        const jobManager = JobManager.getInstance(store);
        console.log("Attaching listener (same as index.ts)...");
        jobManager.on("processJob", async (job, execute) => {
            console.log(`[Listener 1] Received job ${job.id}`);
            // Guard against the manager emitting without an executor callback.
            if (typeof execute !== 'function') {
                console.error(`[Listener 1] ERROR: execute is not a function! It is: ${typeof execute}`);
                return;
            }
            const task = async () => {
                console.log("[Listener 1] Task running...");
                return "ok";
            };
            await execute(task);
            console.log("[Listener 1] Task finished.");
        });
        console.log("Emitting job...");
        const job = jobManager.createJob("prepare", 0, { query: "test" });
        // Wait for the background loop
        await new Promise(r => setTimeout(r, 1000));
        const finalJob = store.getJob(job.id);
        console.log(`Job Result: ${finalJob?.status} - ${finalJob?.status_text}`);
    }
    finally {
        // Always release the store and remove the temporary database.
        store.close();
        if (fs.existsSync(dbPath))
            fs.unlinkSync(dbPath);
    }
    console.log("Test Complete.");
}
|
|
36
|
+
testSync().catch(console.error);
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import { TargetDetector } from "../preparation/target-detector.js";
|
|
2
|
+
import path from "path";
|
|
3
|
+
/**
 * Exercise TargetDetector against two small generated CSV files: one with a
 * numeric "SalePrice" column and one with a categorical "diagnosis" column.
 * Prints the detected target column, its confidence, and the validation
 * result.
 *
 * The temporary CSV (test_target.csv in the working directory) is removed even
 * when detection or validation throws.
 */
async function testDetector() {
    // 1. Test existing build dir
    const detector = new TargetDetector(path.join(process.cwd(), "build"));
    // 2. Create a dummy CSV for testing
    const testFile = path.join(process.cwd(), "test_target.csv");
    const fs = (await import("fs")).default;
    try {
        // Test Case 1: SalePrice (Regression)
        console.log("--- Test Case 1: SalePrice ---");
        fs.writeFileSync(testFile, "id,feature1,feature2,SalePrice\n1,10,20,100000\n2,11,21,120000\n3,12,22,110000");
        let result = await detector.detectTarget(testFile);
        console.log("Detection:", result.target_column, result.confidence);
        if (result.target_column) {
            const val = await detector.validateTarget(testFile, result.target_column);
            console.log("Validation:", val.problem_type, val.valid);
        }
        // Test Case 2: diagnosis (Classification)
        console.log("\n--- Test Case 2: diagnosis ---");
        fs.writeFileSync(testFile, "id,age,diagnosis\n1,50,M\n2,60,B\n3,45,M");
        result = await detector.detectTarget(testFile);
        console.log("Detection:", result.target_column, result.confidence);
        if (result.target_column) {
            const val = await detector.validateTarget(testFile, result.target_column);
            console.log("Validation:", val.problem_type, val.valid);
        }
    }
    finally {
        // The file may not exist if the very first write failed.
        if (fs.existsSync(testFile))
            fs.unlinkSync(testFile);
    }
}
|
|
29
|
+
testDetector().catch(console.error);
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import fs from 'fs';
|
|
2
|
+
// Write-permission probe: writes a minimal MCP config to `target`, then stats
// and reads it back. The target can be overridden with the first CLI argument;
// the default is kept for backward compatibility.
// NOTE(review): the default is one specific user's VS Code mcp.json and the
// write OVERWRITES it. Pass a scratch path when running this anywhere else.
const target = process.argv[2] ?? "C:\\Users\\нурбулан\\AppData\\Roaming\\Code\\User\\mcp.json";
const content = JSON.stringify({ mcpServers: { test: { command: "node" } } }, null, 2);
try {
    console.log(`Testing write to: ${target}`);
    fs.writeFileSync(target, content, "utf8");
    const stat = fs.statSync(target);
    console.log(`Success! File size: ${stat.size} bytes`);
    const readBack = fs.readFileSync(target, "utf8");
    console.log("Read back content:", readBack);
}
catch (e) {
    console.error("Failed to write:", e);
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import { TargetDetector } from "../preparation/target-detector.js";
|
|
2
|
+
import path from "path";
|
|
3
|
+
import fs from "fs";
|
|
4
|
+
/**
 * End-to-end check that quality analysis, target detection and cleaning-plan
 * generation work together: writes a small CSV with a "SalePrice" column, runs
 * the three services against it, and expects a "RenameTarget" operation in the
 * resulting plan.
 *
 * Exits the process with code 1 when the operation is missing or any step
 * throws. The temporary CSV is removed before exiting.
 */
async function verifyIntegration() {
    // 1. Create a dummy CSV with a clear target
    const testFile = path.join(process.cwd(), "data", "raw", "integration_test.csv");
    const testId = "integration_test";
    if (!fs.existsSync(path.dirname(testFile))) {
        fs.mkdirSync(path.dirname(testFile), { recursive: true });
    }
    console.log("Creating test file:", testFile);
    fs.writeFileSync(testFile, "id,feature1,feature2,SalePrice\n1,10,20,100000\n2,11,21,100000\n3,12,22,110000");
    // 2. Call the preview_cleaning tool (simulated by calling valid request handler logic or via MCP client if possible)
    // Since we can't easily call the MCP server from here without a client, we will simulate
    // the logic we added to index.ts to ensure it runs without error.
    // process.exit() terminates immediately and would skip the finally clause,
    // so failures are recorded here and the exit happens after cleanup.
    let failed = false;
    try {
        const { QualityAnalyzer } = await import("../quality/analyzer.js");
        const { CleaningPlanner } = await import("../cleaning/planner.js");
        const { CacheService, MockRedisProvider } = await import("../cache/service.js");
        console.log("Initializing services...");
        const cacheService = new CacheService(new MockRedisProvider());
        // Use build/ directory to simulate runtime environment
        const buildDir = path.join(process.cwd(), "build");
        const qualityAnalyzer = new QualityAnalyzer(cacheService, buildDir);
        const cleaningPlanner = new CleaningPlanner(cacheService, buildDir);
        console.log("Running analysis...");
        const report = await qualityAnalyzer.analyze(testFile);
        console.log("Running target detection...");
        const detector = new TargetDetector(buildDir);
        const targetResult = await detector.detectTarget(testFile);
        console.log("Detected:", targetResult);
        const targetInfo = targetResult.target_column ? {
            target: targetResult.target_column,
            confidence: targetResult.confidence
        } : undefined;
        console.log("Generating plan...");
        const plan = await cleaningPlanner.generatePlan(testId, report, undefined, targetInfo);
        console.log("Plan Operations:", JSON.stringify(plan.operations, null, 2));
        const hasRename = plan.operations.some(op => op.type === "RenameTarget");
        if (hasRename) {
            console.log("✅ SUCCESS: RenameTarget operation found in plan!");
        }
        else {
            console.error("❌ FAILURE: RenameTarget operation NOT found.");
            failed = true;
        }
    }
    catch (e) {
        console.error("Error during verification:", e);
        failed = true;
    }
    finally {
        if (fs.existsSync(testFile))
            fs.unlinkSync(testFile);
    }
    if (failed) {
        process.exit(1);
    }
}
|
|
57
|
+
verifyIntegration();
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import path from "path";
|
|
2
|
+
import { MetadataStore } from "../metadata/store.js";
|
|
3
|
+
import { VectorStore } from "../search/vector-store.js";
|
|
4
|
+
import { Embedder } from "../search/embedder.js";
|
|
5
|
+
import { SearchEngine } from "../search/engine.js";
|
|
6
|
+
import { formatSearchResults } from "../tools/formatter.js";
|
|
7
|
+
import fs from "fs";
|
|
8
|
+
// Search term comes from the first CLI argument; "anime" is the default.
const query = process.argv[2] || "anime";
// Prefer the data under <home>/.vesper; use the project-local data/ directory
// when no metadata database exists there.
const homeDir = process.env.HOME || process.env.USERPROFILE || process.cwd();
const vesperDataRoot = path.join(homeDir, ".vesper");
const homeDbPath = path.join(vesperDataRoot, "data", "metadata.db");
const homeVectorPath = path.join(vesperDataRoot, "data", "vectors.json");
const useHomeData = fs.existsSync(homeDbPath);
if (!useHomeData) {
    console.error("Using local project data directory as fallback...");
}
const dbPath = useHomeData
    ? homeDbPath
    : path.join(process.cwd(), "data", "metadata.db");
const vectorPath = useHomeData
    ? homeVectorPath
    : path.join(process.cwd(), "data", "vectors.json");
const metadataStore = new MetadataStore(dbPath);
const vectorStore = new VectorStore(vectorPath);
const embedder = Embedder.getInstance();
const searchEngine = new SearchEngine(metadataStore, vectorStore, embedder);
/**
 * Run one search (limit 5) for `query` and print the formatted results, or a
 * hint when the search returns nothing.
 */
async function run() {
    console.log(`\n=== VERIFYING SOURCE PRIORITIZATION [Query: "${query}"] ===\n`);
    const hits = await searchEngine.search(query, { limit: 5 });
    if (hits.length !== 0) {
        // Formatted output shows the per-source badges.
        console.log(formatSearchResults(hits));
        return;
    }
    console.log("No results found. Run a search that triggers JIT first!");
}
run().catch(console.error);
|
package/build/search/engine.js
CHANGED
|
@@ -96,12 +96,23 @@ export class SearchEngine {
|
|
|
96
96
|
if (lexicalScore === 0 && positiveKeywords.length > 1) {
|
|
97
97
|
penalty += 0.2;
|
|
98
98
|
}
|
|
99
|
+
// D. Accessibility Bonuses (Prioritize low-friction sources)
|
|
100
|
+
let bonus = 0;
|
|
101
|
+
const sourceBonuses = {
|
|
102
|
+
"huggingface": 0.1,
|
|
103
|
+
"uci": 0.1,
|
|
104
|
+
"github": 0.1,
|
|
105
|
+
"worldbank": 0.1,
|
|
106
|
+
"nasa": 0.1
|
|
107
|
+
};
|
|
108
|
+
bonus = sourceBonuses[metadata.source] || 0;
|
|
99
109
|
// Final Combined Score
|
|
100
|
-
// 70% Vector, 30% Lexical, minus Penalties
|
|
101
|
-
const finalScore = (vectorScore * 0.7) + (lexicalScore * 0.3) - penalty;
|
|
110
|
+
// 70% Vector, 30% Lexical, minus Penalties, plus Bonuses
|
|
111
|
+
const finalScore = (vectorScore * 0.7) + (lexicalScore * 0.3) - penalty + bonus;
|
|
102
112
|
metadata.relevance_score = Math.round(finalScore * 100) / 100;
|
|
103
113
|
metadata.vector_score = Math.round(vectorScore * 100) / 100;
|
|
104
114
|
metadata.lexical_score = Math.round(lexicalScore * 100) / 100;
|
|
115
|
+
metadata.accessibility_bonus = bonus;
|
|
105
116
|
results.push(metadata);
|
|
106
117
|
}
|
|
107
118
|
// Sort by final score and limit
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import { HuggingFaceScraper } from "../metadata/scraper.js";
|
|
2
|
-
import { KaggleMetadataScraper } from "../metadata/kaggle-scraper.js";
|
|
3
2
|
import { UCIScraper } from "../metadata/uci-scraper.js";
|
|
4
3
|
import { GitHubScraper } from "../metadata/github-scraper.js";
|
|
5
4
|
import { WorldBankScraper, NASAScraper } from "../metadata/institutional-scrapers.js";
|
|
@@ -45,7 +44,7 @@ export class JITOrchestrator {
|
|
|
45
44
|
// Get existing dataset IDs to avoid duplicates
|
|
46
45
|
const existing = this.metadataStore.getAllDatasets();
|
|
47
46
|
existing.forEach(ds => existingIds.add(ds.id));
|
|
48
|
-
// 1. Scrape HuggingFace
|
|
47
|
+
// 1. Scrape HuggingFace (Open Access)
|
|
49
48
|
const hfResults = await this.scrapeHuggingFace(query, limit);
|
|
50
49
|
console.error(` HuggingFace: Found ${hfResults.length} datasets`);
|
|
51
50
|
for (const ds of hfResults) {
|
|
@@ -54,21 +53,7 @@ export class JITOrchestrator {
|
|
|
54
53
|
existingIds.add(ds.id);
|
|
55
54
|
}
|
|
56
55
|
}
|
|
57
|
-
// 2. Scrape
|
|
58
|
-
const kaggleUser = process.env.KAGGLE_USERNAME;
|
|
59
|
-
const kaggleKey = process.env.KAGGLE_KEY;
|
|
60
|
-
if (kaggleUser && kaggleKey) {
|
|
61
|
-
const kaggleResults = await this.scrapeKaggle(query, Math.floor(limit / 2));
|
|
62
|
-
console.error(` Kaggle: Found ${kaggleResults.length} datasets`);
|
|
63
|
-
for (const ds of kaggleResults) {
|
|
64
|
-
ds.id = `kaggle:${ds.id}`;
|
|
65
|
-
if (!existingIds.has(ds.id)) {
|
|
66
|
-
newDatasets.push(ds);
|
|
67
|
-
existingIds.add(ds.id);
|
|
68
|
-
}
|
|
69
|
-
}
|
|
70
|
-
}
|
|
71
|
-
// 3. Scrape UCI
|
|
56
|
+
// 2. Scrape UCI (Open Access)
|
|
72
57
|
const uciResults = await this.scrapeUCI(query, Math.floor(limit / 2));
|
|
73
58
|
console.error(` UCI: Found ${uciResults.length} datasets`);
|
|
74
59
|
for (const ds of uciResults) {
|
|
@@ -77,7 +62,7 @@ export class JITOrchestrator {
|
|
|
77
62
|
existingIds.add(ds.id);
|
|
78
63
|
}
|
|
79
64
|
}
|
|
80
|
-
//
|
|
65
|
+
// 3. Scrape GitHub (Open Access)
|
|
81
66
|
const githubResults = await this.scrapeGitHub(query, Math.floor(limit / 2));
|
|
82
67
|
console.error(` GitHub: Found ${githubResults.length} datasets`);
|
|
83
68
|
for (const ds of githubResults) {
|
|
@@ -86,7 +71,7 @@ export class JITOrchestrator {
|
|
|
86
71
|
existingIds.add(ds.id);
|
|
87
72
|
}
|
|
88
73
|
}
|
|
89
|
-
//
|
|
74
|
+
// 4. Scrape World Bank (Open Access)
|
|
90
75
|
const wbResults = await this.scrapeWorldBank(query, Math.floor(limit / 2));
|
|
91
76
|
console.error(` World Bank: Found ${wbResults.length} datasets`);
|
|
92
77
|
for (const ds of wbResults) {
|
|
@@ -95,7 +80,7 @@ export class JITOrchestrator {
|
|
|
95
80
|
existingIds.add(ds.id);
|
|
96
81
|
}
|
|
97
82
|
}
|
|
98
|
-
//
|
|
83
|
+
// 5. Scrape NASA (Open Access)
|
|
99
84
|
const nasaResults = await this.scrapeNASA(query, Math.floor(limit / 2));
|
|
100
85
|
console.error(` NASA: Found ${nasaResults.length} datasets`);
|
|
101
86
|
for (const ds of nasaResults) {
|
|
@@ -125,8 +110,7 @@ export class JITOrchestrator {
|
|
|
125
110
|
async scrapeHuggingFace(query, limit) {
|
|
126
111
|
const scraper = new HuggingFaceScraper();
|
|
127
112
|
try {
|
|
128
|
-
//
|
|
129
|
-
// In the future, we can add a freeTextSearch parameter to the scraper
|
|
113
|
+
// Pass the query as a general search term
|
|
130
114
|
return await scraper.scrape(limit, true, query);
|
|
131
115
|
}
|
|
132
116
|
catch (error) {
|
|
@@ -134,24 +118,6 @@ export class JITOrchestrator {
|
|
|
134
118
|
return [];
|
|
135
119
|
}
|
|
136
120
|
}
|
|
137
|
-
/**
|
|
138
|
-
* Scrape Kaggle with search query
|
|
139
|
-
*/
|
|
140
|
-
async scrapeKaggle(query, limit) {
|
|
141
|
-
const kaggleUser = process.env.KAGGLE_USERNAME;
|
|
142
|
-
const kaggleKey = process.env.KAGGLE_KEY;
|
|
143
|
-
if (!kaggleUser || !kaggleKey) {
|
|
144
|
-
return [];
|
|
145
|
-
}
|
|
146
|
-
try {
|
|
147
|
-
const scraper = new KaggleMetadataScraper(kaggleUser, kaggleKey);
|
|
148
|
-
return await scraper.scrape(query, limit);
|
|
149
|
-
}
|
|
150
|
-
catch (error) {
|
|
151
|
-
console.error(` ERROR: Kaggle scrape failed: ${error.message}`);
|
|
152
|
-
return [];
|
|
153
|
-
}
|
|
154
|
-
}
|
|
155
121
|
/**
|
|
156
122
|
* Scrape UCI
|
|
157
123
|
*/
|
|
@@ -74,6 +74,24 @@ export class VectorStore {
|
|
|
74
74
|
add(id, vector) {
|
|
75
75
|
this.idToVector.set(id, vector instanceof Float32Array ? vector : new Float32Array(vector));
|
|
76
76
|
}
|
|
77
|
+
/**
|
|
78
|
+
* Delete a vector by ID
|
|
79
|
+
*/
|
|
80
|
+
delete(id) {
|
|
81
|
+
return this.idToVector.delete(id);
|
|
82
|
+
}
|
|
83
|
+
/**
|
|
84
|
+
* Delete multiple vectors by IDs
|
|
85
|
+
*/
|
|
86
|
+
deleteMany(ids) {
|
|
87
|
+
let count = 0;
|
|
88
|
+
for (const id of ids) {
|
|
89
|
+
if (this.idToVector.delete(id)) {
|
|
90
|
+
count++;
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
return count;
|
|
94
|
+
}
|
|
77
95
|
search(queryVector, limit = 10) {
|
|
78
96
|
const q = queryVector instanceof Float32Array ? queryVector : new Float32Array(queryVector);
|
|
79
97
|
const results = [];
|