@vespermcp/mcp-server 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +259 -0
- package/build/cache/cdn.js +34 -0
- package/build/cache/service.js +63 -0
- package/build/cleaning/cleaner.js +50 -0
- package/build/cleaning/evaluator.js +89 -0
- package/build/cleaning/executor.js +60 -0
- package/build/cleaning/exporter.js +87 -0
- package/build/cleaning/planner.js +111 -0
- package/build/cleaning/rules.js +57 -0
- package/build/cleaning/types.js +1 -0
- package/build/cloud/adapters/local.js +37 -0
- package/build/cloud/adapters/s3.js +24 -0
- package/build/cloud/storage-manager.js +20 -0
- package/build/cloud/types.js +1 -0
- package/build/compliance/service.js +73 -0
- package/build/compliance/store.js +80 -0
- package/build/compliance/types.js +1 -0
- package/build/data/processing-worker.js +23 -0
- package/build/data/streaming.js +38 -0
- package/build/data/worker-pool.js +39 -0
- package/build/export/exporter.js +45 -0
- package/build/export/packager.js +100 -0
- package/build/export/types.js +1 -0
- package/build/fusion/aligner.js +56 -0
- package/build/fusion/deduplicator.js +69 -0
- package/build/fusion/harmonizer.js +39 -0
- package/build/fusion/orchestrator.js +86 -0
- package/build/fusion/types.js +1 -0
- package/build/index.js +632 -0
- package/build/ingestion/hf-downloader.js +64 -0
- package/build/ingestion/ingestor.js +96 -0
- package/build/ingestion/kaggle-downloader.js +79 -0
- package/build/install/install-service.js +41 -0
- package/build/jobs/manager.js +129 -0
- package/build/jobs/queue.js +59 -0
- package/build/jobs/types.js +1 -0
- package/build/metadata/domain.js +147 -0
- package/build/metadata/github-scraper.js +47 -0
- package/build/metadata/institutional-scrapers.js +49 -0
- package/build/metadata/kaggle-scraper.js +182 -0
- package/build/metadata/license.js +68 -0
- package/build/metadata/monitoring-service.js +107 -0
- package/build/metadata/monitoring-store.js +78 -0
- package/build/metadata/monitoring-types.js +1 -0
- package/build/metadata/quality.js +48 -0
- package/build/metadata/rate-limiter.js +128 -0
- package/build/metadata/scraper.js +353 -0
- package/build/metadata/store.js +325 -0
- package/build/metadata/types.js +1 -0
- package/build/metadata/uci-scraper.js +49 -0
- package/build/monitoring/observability.js +76 -0
- package/build/quality/analyzer.js +57 -0
- package/build/quality/image-analyzer.js +46 -0
- package/build/quality/media-analyzer.js +46 -0
- package/build/quality/quality-orchestrator.js +162 -0
- package/build/quality/types.js +1 -0
- package/build/scripts/build-index.js +54 -0
- package/build/scripts/check-db.js +73 -0
- package/build/scripts/check-jobs.js +24 -0
- package/build/scripts/check-naruto.js +17 -0
- package/build/scripts/demo-full-pipeline.js +62 -0
- package/build/scripts/demo-ui.js +58 -0
- package/build/scripts/e2e-demo.js +72 -0
- package/build/scripts/massive-scrape.js +103 -0
- package/build/scripts/ops-dashboard.js +33 -0
- package/build/scripts/scrape-metadata.js +100 -0
- package/build/scripts/search-cli.js +26 -0
- package/build/scripts/test-bias.js +45 -0
- package/build/scripts/test-caching.js +51 -0
- package/build/scripts/test-cleaning.js +76 -0
- package/build/scripts/test-cloud-storage.js +48 -0
- package/build/scripts/test-compliance.js +58 -0
- package/build/scripts/test-conversion.js +64 -0
- package/build/scripts/test-custom-rules.js +58 -0
- package/build/scripts/test-db-opt.js +63 -0
- package/build/scripts/test-export-custom.js +33 -0
- package/build/scripts/test-exporter.js +53 -0
- package/build/scripts/test-fusion.js +61 -0
- package/build/scripts/test-github.js +27 -0
- package/build/scripts/test-group-split.js +52 -0
- package/build/scripts/test-hf-download.js +29 -0
- package/build/scripts/test-holdout-manager.js +61 -0
- package/build/scripts/test-hybrid-search.js +41 -0
- package/build/scripts/test-image-analysis.js +50 -0
- package/build/scripts/test-ingestion-infra.js +39 -0
- package/build/scripts/test-install.js +40 -0
- package/build/scripts/test-institutional.js +26 -0
- package/build/scripts/test-integrity.js +41 -0
- package/build/scripts/test-jit.js +42 -0
- package/build/scripts/test-job-queue.js +62 -0
- package/build/scripts/test-kaggle-download.js +34 -0
- package/build/scripts/test-large-data.js +50 -0
- package/build/scripts/test-mcp-v5.js +73 -0
- package/build/scripts/test-media-analysis.js +61 -0
- package/build/scripts/test-monitoring.js +91 -0
- package/build/scripts/test-observability.js +106 -0
- package/build/scripts/test-packager.js +55 -0
- package/build/scripts/test-pipeline.js +50 -0
- package/build/scripts/test-planning.js +64 -0
- package/build/scripts/test-privacy.js +38 -0
- package/build/scripts/test-quality.js +43 -0
- package/build/scripts/test-robust-ingestion.js +41 -0
- package/build/scripts/test-schema.js +45 -0
- package/build/scripts/test-split-validation.js +40 -0
- package/build/scripts/test-splitter.js +93 -0
- package/build/scripts/test-uci.js +27 -0
- package/build/scripts/test-unified-quality.js +86 -0
- package/build/search/embedder.js +34 -0
- package/build/search/engine.js +129 -0
- package/build/search/jit-orchestrator.js +232 -0
- package/build/search/vector-store.js +105 -0
- package/build/splitting/splitter.js +57 -0
- package/build/splitting/types.js +1 -0
- package/build/tools/formatter.js +227 -0
- package/build/utils/downloader.js +52 -0
- package/mcp-config-template.json +15 -0
- package/package.json +84 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/cleaner.py +196 -0
- package/src/python/export_engine.py +112 -0
- package/src/python/framework_adapters.py +100 -0
- package/src/python/github_adapter.py +106 -0
- package/src/python/image_engine.py +86 -0
- package/src/python/media_engine.py +133 -0
- package/src/python/nasa_adapter.py +82 -0
- package/src/python/quality_engine.py +243 -0
- package/src/python/splitter_engine.py +283 -0
- package/src/python/test_framework_adapters.py +61 -0
- package/src/python/uci_adapter.py +94 -0
- package/src/python/worldbank_adapter.py +99 -0
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
import { MetadataStore } from "../metadata/store.js";
|
|
2
|
+
import { JobManager } from "../jobs/manager.js";
|
|
3
|
+
import { ObservabilityService, MockErrorTracker } from "../monitoring/observability.js";
|
|
4
|
+
import path from "path";
|
|
5
|
+
/**
 * Drives the enhanced observability end-to-end check: wires a MetadataStore,
 * a MockErrorTracker and an ObservabilityService into the JobManager, enqueues
 * ten mixed jobs (7 succeed, 3 fail, 2 of the successes are slow) and polls
 * the stats until all of them have finished, then hands off to verify().
 */
async function runTest() {
    console.log("--- Initializing Enhanced Observability Test ---");
    const dbPath = path.resolve("data", "test-obs-10.db");
    const errorTracker = new MockErrorTracker();
    const stats = new ObservabilityService(errorTracker);
    const store = new MetadataStore(dbPath);
    const manager = JobManager.getInstance(store, stats);
    manager.setConcurrency(5); // Increased for faster test
    const jobsToRun = 10;

    // Listener to simulate job processing.
    manager.on("processJob", async (job, run) => {
        // Metadata is persisted as JSON, so decode before comparing markers.
        const meta = job.metadata ? JSON.parse(job.metadata) : null;
        const shouldFail = meta === "FAIL";
        const shouldDelay = meta === "SLOW";
        await run(async () => {
            // Simulated work: 100ms for a fast job, 800ms for a slow one.
            await new Promise(r => setTimeout(r, shouldDelay ? 800 : 100));
            if (shouldFail) {
                console.log(`[Worker] Failing Job: ${job.id}`);
                throw new Error("Simulated job failure for metrics tracking");
            }
            console.log(`[Worker] Finished Job: ${job.id}`);
        });
    });

    console.log(`\nEnqueuing ${jobsToRun} mixed jobs...`);
    // Job mix: [Success, Success, FAIL, SlowSuccess, FAIL, Success, Success,
    // SlowSuccess, FAIL, Success] -> 7 Success, 3 Fail.
    const jobConfig = [
        { type: "clean", meta: "SUCCESS" },
        { type: "prepare", meta: "SUCCESS" },
        { type: "fusion", meta: "FAIL" },
        { type: "clean", meta: "SLOW" },
        { type: "prepare", meta: "FAIL" },
        { type: "fusion", meta: "SUCCESS" },
        { type: "clean", meta: "SUCCESS" },
        { type: "prepare", meta: "SLOW" },
        { type: "fusion", meta: "FAIL" },
        { type: "clean", meta: "SUCCESS" },
    ];
    // maxAttempts of 1 avoids retries so the metrics reflect one run per job.
    for (const { type, meta } of jobConfig) {
        manager.createJob(type, 0, meta, 1);
    }

    // Poll until every job has either succeeded or failed, then verify.
    return new Promise((resolve) => {
        const poller = setInterval(() => {
            const snapshot = stats.getStats();
            const finished = Object.values(snapshot)
                .reduce((acc, s) => acc + s.successCount + s.failureCount, 0);
            if (finished >= jobsToRun) {
                clearInterval(poller);
                verify(stats, errorTracker, resolve);
            }
        }, 500);
    });
}
|
|
62
|
+
/**
 * Checks the collected observability data against the expected outcomes:
 * success/failure totals (7/3), latency histogram contents, and the number of
 * exceptions captured by the mock error tracker (3). Logs a verdict for each
 * check and resolves the test promise when done.
 */
function verify(stats, errorTracker, resolve) {
    console.log("\n--- Verification Results ---");
    const summary = stats.getStats();
    const prometheus = stats.getPrometheusMetrics();
    const perType = Object.values(summary);
    const actualSuccess = perType.reduce((acc, s) => acc + s.successCount, 0);
    const actualFailure = perType.reduce((acc, s) => acc + s.failureCount, 0);
    console.log(`Measured Success: ${actualSuccess} (Expected 7)`);
    console.log(`Measured Failure: ${actualFailure} (Expected 3)`);
    let failed = false;

    // 1. Verify processed-job totals.
    if (actualSuccess === 7 && actualFailure === 3) {
        console.log("✅ Jobs processed/hour (total counts) verified.");
    } else {
        console.error("❌ Stats mismatch.");
        failed = true;
    }

    // 2. Verify Latency Histogram
    const hasLatency = prometheus.includes("job_duration_seconds_sum");
    const maxLatencyClean = summary.clean?.maxDuration;
    console.log(`Max Latency (clean): ${maxLatencyClean} (Expected > 100ms)`);
    if (hasLatency && parseFloat(maxLatencyClean) > 0) {
        console.log("✅ Latency histogram contains recorded values.");
    } else {
        console.error("❌ Latency data missing.");
        failed = true;
    }

    // 3. Mock Sentry / Error Tracking Verification
    console.log(`Exceptions Captured: ${errorTracker.exceptions.length} (Expected 3)`);
    if (errorTracker.exceptions.length === 3) {
        console.log("✅ Error tracking successfully logged all failed jobs.");
    } else {
        console.error("❌ Error tracking mismatch.");
        failed = true;
    }

    if (failed) {
        console.error("\n❌ Failure: Observability verification failed.");
    } else {
        console.log("\n✅ Success: Enhanced observability verification complete.");
    }
    resolve();
}
runTest().catch(console.error);
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import { MetadataPackager } from "../export/packager.js";
|
|
2
|
+
import fs from "fs";
|
|
3
|
+
import path from "path";
|
|
4
|
+
/**
 * Smoke test for MetadataPackager: writes a tiny CSV plus metadata/quality
 * inputs, asks the packager to assemble an output directory, then checks that
 * the expected files exist and that the manifest lists two resources with a
 * sha256-prefixed hash.
 */
async function main() {
    console.log("=== Vesper Metadata Packager Test ===\n");
    const packager = new MetadataPackager();
    const testDir = path.join(process.cwd(), "test_package_input");
    const outputDir = path.join(process.cwd(), "test_package_output");
    if (!fs.existsSync(testDir)) {
        fs.mkdirSync(testDir);
    }
    if (fs.existsSync(outputDir)) {
        fs.rmSync(outputDir, { recursive: true, force: true });
    }

    // 1. Create Mock Data
    const dataPath = path.join(testDir, "train.csv");
    fs.writeFileSync(dataPath, "id,value\n1,10\n2,20\n");
    const qualityReport = {
        score: 85,
        issues: ["Missing values in column 'label'"]
    };
    const metadata = {
        name: "Test Dataset",
        version: "1.0.0",
        description: "A test dataset for packaging",
        license: "MIT",
        author: "Vesper Team",
        tags: ["test", "ml"]
    };

    // 2. Create Package
    console.log("Creating package...");
    const resources = [{ path: dataPath, format: "csv", name: "training-data" }];
    const result = await packager.createPackage(outputDir, resources, metadata, { qualityReport });
    console.log("Result:", JSON.stringify(result, null, 2));

    if (!result.success) {
        console.error("FAIL: Packaging failed:", result.error);
        return;
    }

    console.log("\nVerifying files in output...");
    const files = fs.readdirSync(outputDir);
    console.log("Files found:", files);
    const expected = ["datapackage.json", "train.csv", "quality_report.json"];
    if (expected.every((f) => files.includes(f))) {
        console.log("PASS: All files present in package");
        const manifest = JSON.parse(fs.readFileSync(path.join(outputDir, "datapackage.json"), "utf8"));
        console.log("Manifest resources count:", manifest.resources.length);
        if (manifest.resources.length === 2 && manifest.resources[0].hash.startsWith("sha256:")) {
            console.log("PASS: Manifest entry and hash verified");
        }
    } else {
        console.error("FAIL: Missing files in package");
    }
    // Cleanup (left disabled so the output can be inspected after a run)
    // fs.rmSync(testDir, { recursive: true, force: true });
    // fs.rmSync(outputDir, { recursive: true, force: true });
}
main().catch(console.error);
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import { PipelineExecutor } from "../cleaning/executor.js";
|
|
2
|
+
import fs from "fs";
|
|
3
|
+
import path from "path";
|
|
4
|
+
/**
 * End-to-end test for PipelineExecutor: writes a deliberately dirty CSV
 * (duplicate row, fully-empty column, numeric values stored as strings),
 * runs the full cleaning pipeline and checks that the generated plan covers
 * all three issues and that a cleaned output file is produced.
 */
async function main() {
    console.log("=== Vesper Full Pipeline Test ===\n");
    const executor = new PipelineExecutor();
    const testFile = path.join(process.cwd(), "test_pipeline.csv");
    // Create Dirty Data
    // - duplicate: Duplicate row
    // - empty: 100% missing (should drop)
    // - price: "100.5" string (should fix type)
    const csvContent = `id,price,empty,desc
1,"100.5",,A
2,"50.2",,B
1,"100.5",,A
3,"75.0",,C`;
    fs.writeFileSync(testFile, csvContent);
    console.log(`Created dirty file: ${testFile}`);
    try {
        console.log("\nRunning Pipeline...");
        const result = await executor.runPipeline("test-pipeline-ds", testFile);
        console.log("\n=== Pipeline Report ===");
        console.log(`Plan Generated: ${result.plan.operations.length} operations`);
        // The plan must include one operation per injected defect.
        const ops = result.plan.operations.map(o => o.type);
        console.log("Operations:", ops);
        const required = ["RemoveDuplicates", "DropColumns", "FixTypes"];
        if (required.every(t => ops.includes(t))) {
            console.log("Plan Logic: PASS (Generated precise cleaning steps)");
        } else {
            console.error("Plan Logic: FAIL (Missed some cleaning steps)");
        }
        if (result.cleaning_result.success && fs.existsSync(result.final_output_path)) {
            console.log("Execution: PASS (Created cleaned file)");
            // Cleanup both the dirty input and the cleaned output.
            fs.unlinkSync(testFile);
            fs.unlinkSync(result.final_output_path);
        } else {
            console.error("Execution: FAIL");
        }
    } catch (error) {
        console.error("\nTest failed:", error);
    }
}
main().catch(console.error);
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import { CleaningPlanner } from "../cleaning/planner.js";
|
|
2
|
+
/**
 * Unit-style test for CleaningPlanner: feeds it a hand-built quality report
 * containing duplicates, a near-empty column, a numeric column stored as
 * strings, and a column with missing values, then asserts that the generated
 * plan contains all four expected operation types.
 */
async function main() {
    console.log("=== Vesper Cleaning Planner Test ===\n");
    const planner = new CleaningPlanner();
    // Hand-built quality report that should trigger every planner rule.
    const mockReport = {
        row_count: 1000,
        column_count: 4,
        duplicate_rows: 50,
        duplicate_percentage: 5.0,
        columns: [
            {
                name: "id", type: "Int64", inferred_type: "Int64",
                missing_count: 0, missing_percentage: 0, unique_count: 1000,
                is_constant: false, is_mixed_type: false
            },
            {
                name: "empty_col", type: "String", inferred_type: "String",
                missing_count: 950, missing_percentage: 95.0, unique_count: 1,
                is_constant: false, is_mixed_type: false
            },
            {
                name: "age", type: "String", inferred_type: "Numeric (Stored as String)",
                missing_count: 10, missing_percentage: 1.0, unique_count: 50,
                is_constant: false, is_mixed_type: false
            },
            {
                name: "score", type: "Float64", inferred_type: "Float64",
                missing_count: 5, missing_percentage: 0.5, unique_count: 900,
                is_constant: false, is_mixed_type: false
            }
        ],
        warnings: [],
        schema_warnings: [],
        overall_score: 50
    };
    console.log("Input Scenario:");
    console.log("- 50 duplicate rows -> Expect RemoveDuplicates");
    console.log("- 'empty_col' (95% missing) -> Expect DropColumns");
    console.log("- 'age' (String but looks Numeric) -> Expect FixTypes");
    console.log("- 'score' (Missing values) -> Expect FillMissing");
    const plan = await planner.generatePlan("test-dataset", mockReport);
    console.log("\n=== Generated Plan ===");
    let step = 0;
    for (const op of plan.operations) {
        step += 1;
        console.log(`${step}. [${op.type}] Reason: ${op.reason}`);
        if (op.params && Object.keys(op.params).length > 0) {
            console.log(`   Params: ${JSON.stringify(op.params)}`);
        }
    }
    // Verification: every expected operation type must appear in the plan.
    const types = plan.operations.map(o => o.type);
    const expected = ["RemoveDuplicates", "DropColumns", "FixTypes", "FillMissing"];
    if (expected.every(t => types.includes(t))) {
        console.log("\nTest passed! All expected operations generated.");
    } else {
        console.error("\nTest failed! Missing expected operations.");
        console.log("Found:", types);
    }
}
main().catch(console.error);
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import { QualityAnalyzer } from "../quality/analyzer.js";
|
|
2
|
+
import fs from "fs";
|
|
3
|
+
import path from "path";
|
|
4
|
+
/**
 * Tests the QualityAnalyzer's PII detection: writes a CSV containing an email
 * address, a phone number, an SSN and an IPv4 address, runs the analyzer and
 * checks that pii_warnings is non-empty.
 *
 * Fix: the temp CSV was only deleted inside the try block, so a failing
 * analyzer left test_privacy.csv on disk; cleanup now runs in finally.
 */
async function main() {
    console.log("=== Vesper Privacy Scanner Test ===\n");
    const analyzer = new QualityAnalyzer();
    const testFile = path.join(process.cwd(), "test_privacy.csv");
    // Create a CSV with PII
    const csvContent = `id,name,contact,notes
1,John Doe,john@example.com,"User IP: 192.168.1.1"
2,Jane Smith,(555) 123-4567,"SSN is 123-45-6789"
3,Bob Jones,bob@work.com,"Safe notes"`;
    fs.writeFileSync(testFile, csvContent);
    console.log(`Created test file: ${testFile}`);
    console.log("Expectations:");
    console.log("  - Should detect 'Email' in 'contact' column");
    console.log("  - Should detect 'Phone' in 'contact' column");
    console.log("  - Should detect 'SSN' in 'notes' column");
    console.log("  - Should detect 'IPv4' in 'notes' column");
    try {
        console.log("\nRunning analyzer...");
        const report = await analyzer.analyze(testFile);
        console.log("\n=== Privacy Report ===");
        if (report.pii_warnings && report.pii_warnings.length > 0) {
            report.pii_warnings.forEach(w => console.log(`[!] ${w}`));
            console.log("\nTest passed! PII detected.");
        }
        else {
            console.error("\nTest failed! No PII detected.");
        }
    }
    catch (error) {
        console.error("\nTest failed:", error);
    }
    finally {
        // Cleanup: always remove the temp file, even when the analyzer throws.
        if (fs.existsSync(testFile))
            fs.unlinkSync(testFile);
    }
}
main().catch(console.error);
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import { QualityAnalyzer } from "../quality/analyzer.js";
|
|
2
|
+
import fs from "fs";
|
|
3
|
+
import path from "path";
|
|
4
|
+
/**
 * Smoke test for QualityAnalyzer on tabular data: writes a small CSV with a
 * missing value, a duplicate row and an outlier (age 150), runs the analyzer
 * and prints the resulting report (row/column counts, duplicates, per-column
 * details and warnings).
 *
 * Fix: the temp CSV was only deleted inside the try block, so a failing
 * analyzer left test_data.csv on disk; cleanup now runs in finally.
 */
async function main() {
    console.log("=== Vesper Quality Analysis Test ===\n");
    const analyzer = new QualityAnalyzer();
    const testFile = path.join(process.cwd(), "test_data.csv");
    // 1. Create a dummy CSV with some issues
    const csvContent = `id,age,score,category
1,25,90.5,A
2,30,88.0,B
3,,75.0,A
4,25,90.5,A
5,150,12.0,C
6,40,99.9,B`;
    fs.writeFileSync(testFile, csvContent);
    console.log(`Created test file: ${testFile}`);
    console.log("Data contains: 1 missing value, 1 duplicate row, 1 outlier (age 150)\n");
    try {
        console.log("Running analyzer...");
        const report = await analyzer.analyze(testFile);
        console.log("\n=== Analysis Report ===");
        console.log(`Rows: ${report.row_count}, Columns: ${report.column_count}`);
        console.log(`Duplicates: ${report.duplicate_rows} (${report.duplicate_percentage.toFixed(1)}%)`);
        console.log("\nColumn Details:");
        for (const col of report.columns) {
            let details = "";
            if (col.missing_count > 0)
                details += `Missing: ${col.missing_count} `;
            if (col.distribution)
                details += `Mean: ${col.distribution.mean.toFixed(1)} `;
            console.log(`  - ${col.name} (${col.type}): ${details}`);
        }
        console.log("\nWarnings:", report.warnings);
        console.log("\nTest passed!");
    }
    catch (error) {
        console.error("\nTest failed:", error);
    }
    finally {
        // Cleanup: always remove the temp file, even when the analyzer throws.
        if (fs.existsSync(testFile))
            fs.unlinkSync(testFile);
    }
}
main().catch(console.error);
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import { RobustDownloader } from "../utils/downloader.js";
|
|
2
|
+
import fs from "fs";
|
|
3
|
+
import path from "path";
|
|
4
|
+
/**
 * Exercises RobustDownloader's resume support: seeds the target path with
 * partial content to simulate an interrupted download, then re-downloads with
 * resume=true and verifies that the file grew beyond its initial size.
 */
async function runTest() {
    console.log("--- Testing Robust Ingestion (Resume Support) ---");
    const downloader = new RobustDownloader();
    const testDir = path.resolve("./data/test-ingestion");
    if (!fs.existsSync(testDir)) {
        fs.mkdirSync(testDir, { recursive: true });
    }
    const targetPath = path.join(testDir, "test-file.txt");
    const testUrl = "https://raw.githubusercontent.com/google/guava/master/LICENSE";

    // 1. Pre-fill with partial garbage content
    console.log("Simulating a partial download (first 50 bytes)...");
    fs.writeFileSync(targetPath, "This is some existing content that should be ignored by Range 0 but appended if we resume.");
    const initialSize = fs.statSync(targetPath).size;
    console.log(`Initial file size: ${initialSize} bytes`);

    // 2. Attempt Download with Resume
    console.log("Starting downloader with resume=true...");
    try {
        const options = {
            resume: true,
            onProgress: (bytes, total) => {
                console.log(`Progress: ${bytes}/${total} bytes`);
            }
        };
        await downloader.download(testUrl, targetPath, options);
        const finalSize = fs.statSync(targetPath).size;
        console.log(`Final file size: ${finalSize} bytes`);
        const verdict = finalSize > initialSize
            ? "\nVERIFICATION_STATUS: ✅ PASS (Resumed and finished)"
            : "\nVERIFICATION_STATUS: ❌ FAIL (File did not grow)";
        console.log(verdict);
    }
    catch (e) {
        console.error("Download failed:", e.message);
        // Note: github raw might not support ranges perfectly, but the logic check stands
        console.log("\nVERIFICATION_STATUS: ❌ FAIL");
    }
}
runTest().catch(console.error);
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import { QualityAnalyzer } from "../quality/analyzer.js";
|
|
2
|
+
import fs from "fs";
|
|
3
|
+
import path from "path";
|
|
4
|
+
/**
 * Tests QualityAnalyzer's schema checks: a numeric column stored as strings
 * ('price') and a column with irreconcilably mixed types ('mixed'). Prints
 * per-column findings plus the analyzer's schema warnings.
 *
 * Fix: the temp CSV was only deleted inside the try block, so a failing
 * analyzer left test_schema.csv on disk; cleanup now runs in finally.
 */
async function main() {
    console.log("=== Vesper Schema Validation Test ===\n");
    const analyzer = new QualityAnalyzer();
    const testFile = path.join(process.cwd(), "test_schema.csv");
    // Create a CSV with schema issues
    // 'price': stored as string (should be numeric)
    // 'mixed': completely broken mixed types
    const csvContent = `id,price,mixed,valid_str
1,"100.5",10,A
2,"50.2","text",B
3,"75.0",20,A
4,"200",30,C
5,"10.99","oops",B`;
    fs.writeFileSync(testFile, csvContent);
    console.log(`Created test file: ${testFile}`);
    console.log("Expectations:");
    console.log("  - 'price' should be detected as Numeric (Stored as String)");
    console.log("  - 'mixed' should be flagged as Mixed Type");
    try {
        console.log("\nRunning analyzer...");
        const report = await analyzer.analyze(testFile);
        console.log("\n=== Schema Report ===");
        for (const col of report.columns) {
            console.log(`Column: ${col.name}`);
            console.log(`  Type: ${col.type}`);
            console.log(`  Inferred: ${col.inferred_type}`);
            if (col.is_mixed_type)
                console.log(`  [!] Mixed Type Detected`);
            // A mismatch means the inferred type differs and isn't contained in the declared type.
            if (col.inferred_type !== col.type && !col.type.includes(col.inferred_type)) {
                console.log(`  [!] Type Mismatch Detected`);
            }
        }
        console.log("\nSchema Warnings:", report.schema_warnings);
        console.log("\nTest passed!");
    }
    catch (error) {
        console.error("\nTest failed:", error);
    }
    finally {
        // Cleanup: always remove the temp file, even when the analyzer throws.
        if (fs.existsSync(testFile))
            fs.unlinkSync(testFile);
    }
}
main().catch(console.error);
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import { DataSplitter } from "../splitting/splitter.js";
|
|
2
|
+
import fs from "fs";
|
|
3
|
+
import path from "path";
|
|
4
|
+
/**
 * Validates split diagnostics: constructs train/val/test CSVs containing a
 * deliberate ID leak (id 1 appears in both train and val) and label drift
 * (train is all 'A', val is all 'B'), then checks that DataSplitter.validate
 * flags both problems. Mock files are removed in a finally block.
 */
async function main() {
    console.log("=== Vesper Split Validation Test ===\n");
    const splitter = new DataSplitter();
    const trainPath = path.join(process.cwd(), "mock_train.csv");
    const valPath = path.join(process.cwd(), "mock_val.csv");
    const testPath = path.join(process.cwd(), "mock_test.csv");
    // 1. Create splits with LEAKAGE (ID 1 exists in train and val)
    // And DRIFT (Train is all 'A', Val is all 'B')
    fs.writeFileSync(trainPath, "id,label\n1,A\n2,A\n3,A");
    fs.writeFileSync(valPath, "id,label\n1,B\n4,B\n5,B"); // ID 1 is leak
    fs.writeFileSync(testPath, "id,label\n6,A\n7,B");
    try {
        console.log("Running validator...");
        const splits = { train: trainPath, val: valPath, test: testPath };
        const report = await splitter.validate(splits, { id_column: "id", target_column: "label" });
        console.log("\n=== Validation Report ===");
        console.log(`Leakage Detected: ${report.leakage_detected} (Count: ${report.leakage_count})`);
        console.log(`Distribution Mismatch: ${report.distribution_mismatch}`);
        console.log("Warnings:");
        for (const w of report.warnings) {
            console.log(` - ${w}`);
        }
        // Both injected defects must be reported for the test to pass.
        if (report.leakage_detected && report.distribution_mismatch) {
            console.log("\nPASS: Detected both Leakage and Drift.");
        } else {
            console.error("\nFAIL: Missed issues.");
        }
    }
    catch (e) {
        console.error("Test failed:", e);
    }
    finally {
        // Cleanup the mock split files.
        for (const p of [trainPath, valPath, testPath]) {
            if (fs.existsSync(p)) {
                fs.unlinkSync(p);
            }
        }
    }
}
main().catch(console.error);
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
import { DataSplitter } from "../splitting/splitter.js";
|
|
2
|
+
import fs from "fs";
|
|
3
|
+
import path from "path";
|
|
4
|
+
/**
 * End-to-end test of DataSplitter over a 100-row CSV (50 label A / 50 label B
 * with sequential dates). Runs three strategies — random 80/10/10, stratified
 * 60/20/20 on 'label', and time-ordered 70/15/15 on 'date' — and checks the
 * resulting row counts.
 *
 * Fix: the generated day-of-month is now zero-padded so the date strings are
 * valid ISO-8601 and sort chronologically as plain strings; the previous
 * unpadded form ("2023-01-2") sorts after "2023-01-10" lexicographically,
 * which undermined the time-split ordering.
 */
async function main() {
    console.log("=== Vesper Data Splitter Test ===\n");
    const splitter = new DataSplitter();
    const testFile = path.join(process.cwd(), "test_splitting.csv");
    // Create Dummy Data
    // - id: 1..100
    // - label: A (50), B (50) for stratification
    // - date: sequential dates for time split (zero-padded, see above)
    let csvContent = "id,label,date\n";
    for (let i = 0; i < 50; i++)
        csvContent += `${i},A,2023-01-${String((i % 30) + 1).padStart(2, "0")}\n`;
    for (let i = 50; i < 100; i++)
        csvContent += `${i},B,2023-02-${String((i % 28) + 1).padStart(2, "0")}\n`;
    fs.writeFileSync(testFile, csvContent);
    console.log(`Created test file: ${testFile}`);
    console.log("Rows: 100 (50 A, 50 B)");
    // Test 1: Random Split (80/10/10)
    console.log("\n--- Test 1: Random Split (80/10/10) ---");
    const config1 = {
        type: "random",
        ratios: { train: 0.8, val: 0.1, test: 0.1, holdout: 0 },
        shuffle: true,
        random_seed: 42
    };
    try {
        const result1 = await splitter.split(testFile, config1);
        console.log("Stats:", result1.stats);
        if (result1.stats.train_rows === 80 && result1.stats.val_rows === 10 && result1.stats.test_rows === 10) {
            console.log("PASS: Ratios preserved");
        }
        else {
            console.error("FAIL: Incorrect ratios");
        }
    }
    catch (e) {
        console.error(e);
    }
    // Test 2: Stratified Split (60/20/20) on 'label'
    console.log("\n--- Test 2: Stratified Split (60/20/20) ---");
    const config2 = {
        type: "stratified",
        ratios: { train: 0.6, val: 0.2, test: 0.2, holdout: 0 },
        target_column: "label",
        random_seed: 42
    };
    try {
        const result2 = await splitter.split(testFile, config2);
        console.log("Stats:", result2.stats);
        if (result2.stats.train_rows === 60 && result2.stats.val_rows === 20 && result2.stats.test_rows === 20) {
            console.log("PASS: Ratios preserved");
            // In a deeper test we would verify the class distribution too, but simple count is good for now
        }
        else {
            console.error("FAIL: Incorrect ratios");
        }
    }
    catch (e) {
        console.error(e);
    }
    // Test 3: Time Split (70/15/15) on 'date'
    console.log("\n--- Test 3: Time Split (70/15/15) ---");
    const config3 = {
        type: "time",
        ratios: { train: 0.7, val: 0.15, test: 0.15, holdout: 0 },
        time_column: "date"
    };
    try {
        const result3 = await splitter.split(testFile, config3);
        console.log("Stats:", result3.stats);
        // Clean up generated files (guarded so a partial run doesn't throw here)
        for (const p of [result3.paths.train, result3.paths.val, result3.paths.test]) {
            if (fs.existsSync(p))
                fs.unlinkSync(p);
        }
        // Clean up previous test files too (using known naming convention)
        const base = testFile.replace(".csv", "");
        if (fs.existsSync(base + "_train.csv"))
            fs.unlinkSync(base + "_train.csv");
        if (fs.existsSync(base + "_val.csv"))
            fs.unlinkSync(base + "_val.csv");
        if (fs.existsSync(base + "_test.csv"))
            fs.unlinkSync(base + "_test.csv");
        console.log("PASS: Execution successful");
    }
    catch (e) {
        console.error(e);
    }
    // Cleanup input (guarded in case an earlier step already removed it)
    if (fs.existsSync(testFile))
        fs.unlinkSync(testFile);
}
main().catch(console.error);
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import { UCIScraper } from "../metadata/uci-scraper.js";
|
|
2
|
+
/**
 * Integration check for the UCI scraper: searches for "iris", prints a sample
 * result and validates the scraped metadata shape (id prefix and source field).
 */
async function runTest() {
    console.log("--- Testing UCI integration ---");
    const scraper = new UCIScraper();
    const query = "iris";
    console.log(`Searching UCI for: "${query}"...`);
    const results = await scraper.scrape(query, 5);
    console.log(`Found ${results.length} datasets.`);
    if (results.length === 0) {
        console.error("❌ No results found. Ensure python and ucimlrepo are installed.");
    } else {
        console.log("✅ UCI Scraper returned results.");
        console.log("Sample Result:");
        const [sample] = results;
        console.log(JSON.stringify(sample, null, 2));
        // Validation of the first scraped record.
        if (sample.id.startsWith("uci:") && sample.source === "uci") {
            console.log("✅ Metadata schema validation passed.");
        } else {
            console.error("❌ Metadata schema validation failed.");
        }
    }
    console.log("--- Test Complete ---");
}
runTest().catch(console.error);
|