vesper-wizard 2.3.0 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -322
- package/package.json +34 -100
- package/vesper-mcp-config.json +6 -0
- package/{scripts/wizard.js → wizard.js} +34 -10
- package/LICENSE +0 -21
- package/build/cache/cdn.js +0 -34
- package/build/cache/service.js +0 -63
- package/build/cleaning/cleaner.js +0 -81
- package/build/cleaning/evaluator.js +0 -89
- package/build/cleaning/executor.js +0 -62
- package/build/cleaning/exporter.js +0 -87
- package/build/cleaning/planner.js +0 -127
- package/build/cleaning/rules.js +0 -57
- package/build/cleaning/types.js +0 -1
- package/build/cloud/adapters/local.js +0 -37
- package/build/cloud/adapters/s3.js +0 -24
- package/build/cloud/adapters/supabase.js +0 -49
- package/build/cloud/storage-manager.js +0 -26
- package/build/cloud/types.js +0 -1
- package/build/compliance/service.js +0 -73
- package/build/compliance/store.js +0 -80
- package/build/compliance/types.js +0 -1
- package/build/config/config-manager.js +0 -221
- package/build/config/secure-keys.js +0 -51
- package/build/config/user-config.js +0 -48
- package/build/data/processing-worker.js +0 -23
- package/build/data/streaming.js +0 -38
- package/build/data/worker-pool.js +0 -39
- package/build/export/exporter.js +0 -82
- package/build/export/packager.js +0 -100
- package/build/export/types.js +0 -1
- package/build/fusion/aligner.js +0 -56
- package/build/fusion/deduplicator.js +0 -69
- package/build/fusion/engine.js +0 -69
- package/build/fusion/harmonizer.js +0 -39
- package/build/fusion/orchestrator.js +0 -86
- package/build/fusion/types.js +0 -1
- package/build/gateway/unified-dataset-gateway.js +0 -410
- package/build/index.js +0 -3068
- package/build/ingestion/hf-downloader.js +0 -171
- package/build/ingestion/ingestor.js +0 -271
- package/build/ingestion/kaggle-downloader.js +0 -102
- package/build/install/install-service.js +0 -46
- package/build/jobs/manager.js +0 -136
- package/build/jobs/queue.js +0 -59
- package/build/jobs/types.js +0 -1
- package/build/lib/supabase.js +0 -3
- package/build/metadata/dataworld-source.js +0 -89
- package/build/metadata/domain.js +0 -147
- package/build/metadata/github-scraper.js +0 -47
- package/build/metadata/institutional-scrapers.js +0 -49
- package/build/metadata/kaggle-scraper.js +0 -182
- package/build/metadata/kaggle-source.js +0 -70
- package/build/metadata/license.js +0 -68
- package/build/metadata/monitoring-service.js +0 -107
- package/build/metadata/monitoring-store.js +0 -78
- package/build/metadata/monitoring-types.js +0 -1
- package/build/metadata/openml-source.js +0 -87
- package/build/metadata/quality.js +0 -48
- package/build/metadata/rate-limiter.js +0 -128
- package/build/metadata/scraper.js +0 -448
- package/build/metadata/store.js +0 -340
- package/build/metadata/types.js +0 -1
- package/build/metadata/uci-scraper.js +0 -49
- package/build/monitoring/observability.js +0 -76
- package/build/preparation/target-detector.js +0 -75
- package/build/python/__pycache__/config.cpython-312.pyc +0 -0
- package/build/python/asset_downloader_engine.py +0 -94
- package/build/python/cleaner.py +0 -226
- package/build/python/config.py +0 -263
- package/build/python/convert_engine.py +0 -92
- package/build/python/dataworld_engine.py +0 -208
- package/build/python/export_engine.py +0 -288
- package/build/python/framework_adapters.py +0 -100
- package/build/python/fusion_engine.py +0 -368
- package/build/python/github_adapter.py +0 -106
- package/build/python/hf_fallback.py +0 -298
- package/build/python/image_engine.py +0 -86
- package/build/python/kaggle_engine.py +0 -295
- package/build/python/media_engine.py +0 -133
- package/build/python/nasa_adapter.py +0 -82
- package/build/python/normalize_engine.py +0 -83
- package/build/python/openml_engine.py +0 -146
- package/build/python/quality_engine.py +0 -267
- package/build/python/row_count.py +0 -54
- package/build/python/splitter_engine.py +0 -283
- package/build/python/target_engine.py +0 -154
- package/build/python/test_framework_adapters.py +0 -61
- package/build/python/test_fusion_engine.py +0 -89
- package/build/python/uci_adapter.py +0 -94
- package/build/python/vesper/__init__.py +0 -1
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +0 -1
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +0 -679
- package/build/python/vesper/core/download_recipe.py +0 -104
- package/build/python/worldbank_adapter.py +0 -99
- package/build/quality/analyzer.js +0 -93
- package/build/quality/image-analyzer.js +0 -114
- package/build/quality/media-analyzer.js +0 -115
- package/build/quality/quality-orchestrator.js +0 -162
- package/build/quality/types.js +0 -1
- package/build/scripts/build-index.js +0 -54
- package/build/scripts/check-db.js +0 -73
- package/build/scripts/check-jobs.js +0 -24
- package/build/scripts/check-naruto.js +0 -17
- package/build/scripts/cleanup-kaggle.js +0 -41
- package/build/scripts/demo-full-pipeline.js +0 -62
- package/build/scripts/demo-ui.js +0 -58
- package/build/scripts/e2e-demo.js +0 -72
- package/build/scripts/massive-scrape.js +0 -103
- package/build/scripts/ops-dashboard.js +0 -33
- package/build/scripts/repro-bug.js +0 -37
- package/build/scripts/repro-export-bug.js +0 -56
- package/build/scripts/scrape-metadata.js +0 -100
- package/build/scripts/search-cli.js +0 -26
- package/build/scripts/test-bias.js +0 -45
- package/build/scripts/test-caching.js +0 -51
- package/build/scripts/test-cleaning.js +0 -76
- package/build/scripts/test-cloud-storage.js +0 -48
- package/build/scripts/test-compliance.js +0 -58
- package/build/scripts/test-conversion.js +0 -64
- package/build/scripts/test-custom-rules.js +0 -58
- package/build/scripts/test-db-opt.js +0 -63
- package/build/scripts/test-export-custom.js +0 -33
- package/build/scripts/test-exporter.js +0 -53
- package/build/scripts/test-fusion.js +0 -61
- package/build/scripts/test-github.js +0 -27
- package/build/scripts/test-group-split.js +0 -52
- package/build/scripts/test-hf-download.js +0 -29
- package/build/scripts/test-holdout-manager.js +0 -61
- package/build/scripts/test-hybrid-search.js +0 -41
- package/build/scripts/test-image-analysis.js +0 -50
- package/build/scripts/test-ingestion-infra.js +0 -39
- package/build/scripts/test-install.js +0 -40
- package/build/scripts/test-institutional.js +0 -26
- package/build/scripts/test-integrity.js +0 -41
- package/build/scripts/test-jit.js +0 -42
- package/build/scripts/test-job-queue.js +0 -62
- package/build/scripts/test-kaggle-download.js +0 -34
- package/build/scripts/test-large-data.js +0 -50
- package/build/scripts/test-mcp-v5.js +0 -74
- package/build/scripts/test-media-analysis.js +0 -61
- package/build/scripts/test-monitoring.js +0 -91
- package/build/scripts/test-observability.js +0 -106
- package/build/scripts/test-packager.js +0 -55
- package/build/scripts/test-pipeline.js +0 -50
- package/build/scripts/test-planning.js +0 -64
- package/build/scripts/test-privacy.js +0 -38
- package/build/scripts/test-production-sync.js +0 -36
- package/build/scripts/test-quality.js +0 -43
- package/build/scripts/test-robust-ingestion.js +0 -41
- package/build/scripts/test-schema.js +0 -45
- package/build/scripts/test-split-validation.js +0 -40
- package/build/scripts/test-splitter.js +0 -93
- package/build/scripts/test-target-detector.js +0 -29
- package/build/scripts/test-uci.js +0 -27
- package/build/scripts/test-unified-quality.js +0 -86
- package/build/scripts/test-write.js +0 -14
- package/build/scripts/verify-integration.js +0 -57
- package/build/scripts/verify-priority.js +0 -33
- package/build/search/embedder.js +0 -34
- package/build/search/engine.js +0 -190
- package/build/search/jit-orchestrator.js +0 -262
- package/build/search/query-intent.js +0 -509
- package/build/search/vector-store.js +0 -123
- package/build/splitting/splitter.js +0 -82
- package/build/splitting/types.js +0 -1
- package/build/tools/formatter.js +0 -251
- package/build/utils/downloader.js +0 -52
- package/build/utils/python-runtime.js +0 -130
- package/build/utils/selector.js +0 -69
- package/mcp-config-template.json +0 -18
- package/scripts/postinstall.cjs +0 -170
- package/scripts/preindex_registry.cjs +0 -157
- package/scripts/refresh-index.cjs +0 -87
- package/scripts/wizard.cjs +0 -601
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +0 -94
- package/src/python/cleaner.py +0 -226
- package/src/python/config.py +0 -263
- package/src/python/convert_engine.py +0 -92
- package/src/python/dataworld_engine.py +0 -208
- package/src/python/export_engine.py +0 -288
- package/src/python/framework_adapters.py +0 -100
- package/src/python/fusion_engine.py +0 -368
- package/src/python/github_adapter.py +0 -106
- package/src/python/hf_fallback.py +0 -298
- package/src/python/image_engine.py +0 -86
- package/src/python/kaggle_engine.py +0 -295
- package/src/python/media_engine.py +0 -133
- package/src/python/nasa_adapter.py +0 -82
- package/src/python/normalize_engine.py +0 -83
- package/src/python/openml_engine.py +0 -146
- package/src/python/quality_engine.py +0 -267
- package/src/python/row_count.py +0 -54
- package/src/python/splitter_engine.py +0 -283
- package/src/python/target_engine.py +0 -154
- package/src/python/test_framework_adapters.py +0 -61
- package/src/python/test_fusion_engine.py +0 -89
- package/src/python/uci_adapter.py +0 -94
- package/src/python/vesper/__init__.py +0 -1
- package/src/python/vesper/core/__init__.py +0 -1
- package/src/python/vesper/core/asset_downloader.py +0 -679
- package/src/python/vesper/core/download_recipe.py +0 -104
- package/src/python/worldbank_adapter.py +0 -99
- package/wizard.cjs +0 -3
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
import { QualityAnalyzer } from "../quality/analyzer.js";
|
|
2
|
-
import fs from "fs";
|
|
3
|
-
import path from "path";
|
|
4
|
-
async function main() {
|
|
5
|
-
console.log("=== Vesper Quality Analysis Test ===\n");
|
|
6
|
-
const analyzer = new QualityAnalyzer();
|
|
7
|
-
const testFile = path.join(process.cwd(), "test_data.csv");
|
|
8
|
-
// 1. Create a dummy CSV with some issues
|
|
9
|
-
const csvContent = `id,age,score,category
|
|
10
|
-
1,25,90.5,A
|
|
11
|
-
2,30,88.0,B
|
|
12
|
-
3,,75.0,A
|
|
13
|
-
4,25,90.5,A
|
|
14
|
-
5,150,12.0,C
|
|
15
|
-
6,40,99.9,B`;
|
|
16
|
-
fs.writeFileSync(testFile, csvContent);
|
|
17
|
-
console.log(`Created test file: ${testFile}`);
|
|
18
|
-
console.log("Data contains: 1 missing value, 1 duplicate row, 1 outlier (age 150)\n");
|
|
19
|
-
try {
|
|
20
|
-
console.log("Running analyzer...");
|
|
21
|
-
const report = await analyzer.analyze(testFile);
|
|
22
|
-
console.log("\n=== Analysis Report ===");
|
|
23
|
-
console.log(`Rows: ${report.row_count}, Columns: ${report.column_count}`);
|
|
24
|
-
console.log(`Duplicates: ${report.duplicate_rows} (${report.duplicate_percentage.toFixed(1)}%)`);
|
|
25
|
-
console.log("\nColumn Details:");
|
|
26
|
-
for (const col of report.columns) {
|
|
27
|
-
let details = "";
|
|
28
|
-
if (col.missing_count > 0)
|
|
29
|
-
details += `Missing: ${col.missing_count} `;
|
|
30
|
-
if (col.distribution)
|
|
31
|
-
details += `Mean: ${col.distribution.mean.toFixed(1)} `;
|
|
32
|
-
console.log(` - ${col.name} (${col.type}): ${details}`);
|
|
33
|
-
}
|
|
34
|
-
console.log("\nWarnings:", report.warnings);
|
|
35
|
-
// Cleanup
|
|
36
|
-
fs.unlinkSync(testFile);
|
|
37
|
-
console.log("\nTest passed!");
|
|
38
|
-
}
|
|
39
|
-
catch (error) {
|
|
40
|
-
console.error("\nTest failed:", error);
|
|
41
|
-
}
|
|
42
|
-
}
|
|
43
|
-
main().catch(console.error);
|
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
import { RobustDownloader } from "../utils/downloader.js";
|
|
2
|
-
import fs from "fs";
|
|
3
|
-
import path from "path";
|
|
4
|
-
async function runTest() {
|
|
5
|
-
console.log("--- Testing Robust Ingestion (Resume Support) ---");
|
|
6
|
-
const downloader = new RobustDownloader();
|
|
7
|
-
const testDir = path.resolve("./data/test-ingestion");
|
|
8
|
-
if (!fs.existsSync(testDir))
|
|
9
|
-
fs.mkdirSync(testDir, { recursive: true });
|
|
10
|
-
const targetPath = path.join(testDir, "test-file.txt");
|
|
11
|
-
const testUrl = "https://raw.githubusercontent.com/google/guava/master/LICENSE";
|
|
12
|
-
// 1. Pre-fill with partial garbage content
|
|
13
|
-
console.log("Simulating a partial download (first 50 bytes)...");
|
|
14
|
-
fs.writeFileSync(targetPath, "This is some existing content that should be ignored by Range 0 but appended if we resume.");
|
|
15
|
-
const initialSize = fs.statSync(targetPath).size;
|
|
16
|
-
console.log(`Initial file size: ${initialSize} bytes`);
|
|
17
|
-
// 2. Attempt Download with Resume
|
|
18
|
-
console.log("Starting downloader with resume=true...");
|
|
19
|
-
try {
|
|
20
|
-
await downloader.download(testUrl, targetPath, {
|
|
21
|
-
resume: true,
|
|
22
|
-
onProgress: (bytes, total) => {
|
|
23
|
-
console.log(`Progress: ${bytes}/${total} bytes`);
|
|
24
|
-
}
|
|
25
|
-
});
|
|
26
|
-
const finalSize = fs.statSync(targetPath).size;
|
|
27
|
-
console.log(`Final file size: ${finalSize} bytes`);
|
|
28
|
-
if (finalSize > initialSize) {
|
|
29
|
-
console.log("\nVERIFICATION_STATUS: ✅ PASS (Resumed and finished)");
|
|
30
|
-
}
|
|
31
|
-
else {
|
|
32
|
-
console.log("\nVERIFICATION_STATUS: ❌ FAIL (File did not grow)");
|
|
33
|
-
}
|
|
34
|
-
}
|
|
35
|
-
catch (e) {
|
|
36
|
-
console.error("Download failed:", e.message);
|
|
37
|
-
// Note: github raw might not support ranges perfectly, but the logic check stands
|
|
38
|
-
console.log("\nVERIFICATION_STATUS: ❌ FAIL");
|
|
39
|
-
}
|
|
40
|
-
}
|
|
41
|
-
runTest().catch(console.error);
|
|
@@ -1,45 +0,0 @@
|
|
|
1
|
-
import { QualityAnalyzer } from "../quality/analyzer.js";
|
|
2
|
-
import fs from "fs";
|
|
3
|
-
import path from "path";
|
|
4
|
-
async function main() {
|
|
5
|
-
console.log("=== Vesper Schema Validation Test ===\n");
|
|
6
|
-
const analyzer = new QualityAnalyzer();
|
|
7
|
-
const testFile = path.join(process.cwd(), "test_schema.csv");
|
|
8
|
-
// Create a CSV with schema issues
|
|
9
|
-
// 'price': stored as string (should be numeric)
|
|
10
|
-
// 'mixed': completely broken mixed types
|
|
11
|
-
const csvContent = `id,price,mixed,valid_str
|
|
12
|
-
1,"100.5",10,A
|
|
13
|
-
2,"50.2","text",B
|
|
14
|
-
3,"75.0",20,A
|
|
15
|
-
4,"200",30,C
|
|
16
|
-
5,"10.99","oops",B`;
|
|
17
|
-
fs.writeFileSync(testFile, csvContent);
|
|
18
|
-
console.log(`Created test file: ${testFile}`);
|
|
19
|
-
console.log("Expectations:");
|
|
20
|
-
console.log(" - 'price' should be detected as Numeric (Stored as String)");
|
|
21
|
-
console.log(" - 'mixed' should be flagged as Mixed Type");
|
|
22
|
-
try {
|
|
23
|
-
console.log("\nRunning analyzer...");
|
|
24
|
-
const report = await analyzer.analyze(testFile);
|
|
25
|
-
console.log("\n=== Schema Report ===");
|
|
26
|
-
for (const col of report.columns) {
|
|
27
|
-
console.log(`Column: ${col.name}`);
|
|
28
|
-
console.log(` Type: ${col.type}`);
|
|
29
|
-
console.log(` Inferred: ${col.inferred_type}`);
|
|
30
|
-
if (col.is_mixed_type)
|
|
31
|
-
console.log(` [!] Mixed Type Detected`);
|
|
32
|
-
if (col.inferred_type !== col.type && !col.type.includes(col.inferred_type)) {
|
|
33
|
-
console.log(` [!] Type Mismatch Detected`);
|
|
34
|
-
}
|
|
35
|
-
}
|
|
36
|
-
console.log("\nSchema Warnings:", report.schema_warnings);
|
|
37
|
-
// Cleanup
|
|
38
|
-
fs.unlinkSync(testFile);
|
|
39
|
-
console.log("\nTest passed!");
|
|
40
|
-
}
|
|
41
|
-
catch (error) {
|
|
42
|
-
console.error("\nTest failed:", error);
|
|
43
|
-
}
|
|
44
|
-
}
|
|
45
|
-
main().catch(console.error);
|
|
@@ -1,40 +0,0 @@
|
|
|
1
|
-
import { DataSplitter } from "../splitting/splitter.js";
|
|
2
|
-
import fs from "fs";
|
|
3
|
-
import path from "path";
|
|
4
|
-
async function main() {
|
|
5
|
-
console.log("=== Vesper Split Validation Test ===\n");
|
|
6
|
-
const splitter = new DataSplitter();
|
|
7
|
-
const trainPath = path.join(process.cwd(), "mock_train.csv");
|
|
8
|
-
const valPath = path.join(process.cwd(), "mock_val.csv");
|
|
9
|
-
const testPath = path.join(process.cwd(), "mock_test.csv");
|
|
10
|
-
// 1. Create splits with LEAKAGE (ID 1 exists in train and val)
|
|
11
|
-
// And DRIFT (Train is all 'A', Val is all 'B')
|
|
12
|
-
fs.writeFileSync(trainPath, "id,label\n1,A\n2,A\n3,A");
|
|
13
|
-
fs.writeFileSync(valPath, "id,label\n1,B\n4,B\n5,B"); // ID 1 is leak
|
|
14
|
-
fs.writeFileSync(testPath, "id,label\n6,A\n7,B");
|
|
15
|
-
try {
|
|
16
|
-
console.log("Running validator...");
|
|
17
|
-
const report = await splitter.validate({ train: trainPath, val: valPath, test: testPath }, { id_column: "id", target_column: "label" });
|
|
18
|
-
console.log("\n=== Validation Report ===");
|
|
19
|
-
console.log(`Leakage Detected: ${report.leakage_detected} (Count: ${report.leakage_count})`);
|
|
20
|
-
console.log(`Distribution Mismatch: ${report.distribution_mismatch}`);
|
|
21
|
-
console.log("Warnings:");
|
|
22
|
-
report.warnings.forEach(w => console.log(` - ${w}`));
|
|
23
|
-
// Assertions
|
|
24
|
-
if (report.leakage_detected && report.distribution_mismatch) {
|
|
25
|
-
console.log("\nPASS: Detected both Leakage and Drift.");
|
|
26
|
-
}
|
|
27
|
-
else {
|
|
28
|
-
console.error("\nFAIL: Missed issues.");
|
|
29
|
-
}
|
|
30
|
-
}
|
|
31
|
-
catch (e) {
|
|
32
|
-
console.error("Test failed:", e);
|
|
33
|
-
}
|
|
34
|
-
finally {
|
|
35
|
-
// Cleanup
|
|
36
|
-
[trainPath, valPath, testPath].forEach(p => { if (fs.existsSync(p))
|
|
37
|
-
fs.unlinkSync(p); });
|
|
38
|
-
}
|
|
39
|
-
}
|
|
40
|
-
main().catch(console.error);
|
|
@@ -1,93 +0,0 @@
|
|
|
1
|
-
import { DataSplitter } from "../splitting/splitter.js";
|
|
2
|
-
import fs from "fs";
|
|
3
|
-
import path from "path";
|
|
4
|
-
async function main() {
|
|
5
|
-
console.log("=== Vesper Data Splitter Test ===\n");
|
|
6
|
-
const splitter = new DataSplitter();
|
|
7
|
-
const testFile = path.join(process.cwd(), "test_splitting.csv");
|
|
8
|
-
// Create Dummy Data
|
|
9
|
-
// - id: 1..100
|
|
10
|
-
// - label: A (50), B (50) for stratification
|
|
11
|
-
// - date: sequential dates for time split
|
|
12
|
-
let csvContent = "id,label,date\n";
|
|
13
|
-
for (let i = 0; i < 50; i++)
|
|
14
|
-
csvContent += `${i},A,2023-01-${(i % 30) + 1}\n`;
|
|
15
|
-
for (let i = 50; i < 100; i++)
|
|
16
|
-
csvContent += `${i},B,2023-02-${(i % 28) + 1}\n`;
|
|
17
|
-
fs.writeFileSync(testFile, csvContent);
|
|
18
|
-
console.log(`Created test file: ${testFile}`);
|
|
19
|
-
console.log("Rows: 100 (50 A, 50 B)");
|
|
20
|
-
// Test 1: Random Split (80/10/10)
|
|
21
|
-
console.log("\n--- Test 1: Random Split (80/10/10) ---");
|
|
22
|
-
const config1 = {
|
|
23
|
-
type: "random",
|
|
24
|
-
ratios: { train: 0.8, val: 0.1, test: 0.1, holdout: 0 },
|
|
25
|
-
shuffle: true,
|
|
26
|
-
random_seed: 42
|
|
27
|
-
};
|
|
28
|
-
try {
|
|
29
|
-
const result1 = await splitter.split(testFile, config1);
|
|
30
|
-
console.log("Stats:", result1.stats);
|
|
31
|
-
if (result1.stats.train_rows === 80 && result1.stats.val_rows === 10 && result1.stats.test_rows === 10) {
|
|
32
|
-
console.log("PASS: Ratios preserved");
|
|
33
|
-
}
|
|
34
|
-
else {
|
|
35
|
-
console.error("FAIL: Incorrect ratios");
|
|
36
|
-
}
|
|
37
|
-
}
|
|
38
|
-
catch (e) {
|
|
39
|
-
console.error(e);
|
|
40
|
-
}
|
|
41
|
-
// Test 2: Stratified Split (60/20/20) on 'label'
|
|
42
|
-
console.log("\n--- Test 2: Stratified Split (60/20/20) ---");
|
|
43
|
-
const config2 = {
|
|
44
|
-
type: "stratified",
|
|
45
|
-
ratios: { train: 0.6, val: 0.2, test: 0.2, holdout: 0 },
|
|
46
|
-
target_column: "label",
|
|
47
|
-
random_seed: 42
|
|
48
|
-
};
|
|
49
|
-
try {
|
|
50
|
-
const result2 = await splitter.split(testFile, config2);
|
|
51
|
-
console.log("Stats:", result2.stats);
|
|
52
|
-
if (result2.stats.train_rows === 60 && result2.stats.val_rows === 20 && result2.stats.test_rows === 20) {
|
|
53
|
-
console.log("PASS: Ratios preserved");
|
|
54
|
-
// In a deeper test we would verify the class distribution too, but simple count is good for now
|
|
55
|
-
}
|
|
56
|
-
else {
|
|
57
|
-
console.error("FAIL: Incorrect ratios");
|
|
58
|
-
}
|
|
59
|
-
}
|
|
60
|
-
catch (e) {
|
|
61
|
-
console.error(e);
|
|
62
|
-
}
|
|
63
|
-
// Test 3: Time Split (70/15/15) on 'date'
|
|
64
|
-
console.log("\n--- Test 3: Time Split (70/15/15) ---");
|
|
65
|
-
const config3 = {
|
|
66
|
-
type: "time",
|
|
67
|
-
ratios: { train: 0.7, val: 0.15, test: 0.15, holdout: 0 },
|
|
68
|
-
time_column: "date"
|
|
69
|
-
};
|
|
70
|
-
try {
|
|
71
|
-
const result3 = await splitter.split(testFile, config3);
|
|
72
|
-
console.log("Stats:", result3.stats);
|
|
73
|
-
// Clean up generated files
|
|
74
|
-
fs.unlinkSync(result3.paths.train);
|
|
75
|
-
fs.unlinkSync(result3.paths.val);
|
|
76
|
-
fs.unlinkSync(result3.paths.test);
|
|
77
|
-
// Clean up previous test files too (using known naming convention)
|
|
78
|
-
const base = testFile.replace(".csv", "");
|
|
79
|
-
if (fs.existsSync(base + "_train.csv"))
|
|
80
|
-
fs.unlinkSync(base + "_train.csv");
|
|
81
|
-
if (fs.existsSync(base + "_val.csv"))
|
|
82
|
-
fs.unlinkSync(base + "_val.csv");
|
|
83
|
-
if (fs.existsSync(base + "_test.csv"))
|
|
84
|
-
fs.unlinkSync(base + "_test.csv");
|
|
85
|
-
console.log("PASS: Execution successful");
|
|
86
|
-
}
|
|
87
|
-
catch (e) {
|
|
88
|
-
console.error(e);
|
|
89
|
-
}
|
|
90
|
-
// Cleanup input
|
|
91
|
-
fs.unlinkSync(testFile);
|
|
92
|
-
}
|
|
93
|
-
main().catch(console.error);
|
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
import { TargetDetector } from "../preparation/target-detector.js";
|
|
2
|
-
import path from "path";
|
|
3
|
-
async function testDetector() {
|
|
4
|
-
// 1. Test existing build dir
|
|
5
|
-
const detector = new TargetDetector(path.join(process.cwd(), "build"));
|
|
6
|
-
// 2. Create a dummy CSV for testing
|
|
7
|
-
const testFile = path.join(process.cwd(), "test_target.csv");
|
|
8
|
-
const fs = (await import("fs")).default;
|
|
9
|
-
// Test Case 1: SalePrice (Regression)
|
|
10
|
-
console.log("--- Test Case 1: SalePrice ---");
|
|
11
|
-
fs.writeFileSync(testFile, "id,feature1,feature2,SalePrice\n1,10,20,100000\n2,11,21,120000\n3,12,22,110000");
|
|
12
|
-
let result = await detector.detectTarget(testFile);
|
|
13
|
-
console.log("Detection:", result.target_column, result.confidence);
|
|
14
|
-
if (result.target_column) {
|
|
15
|
-
let val = await detector.validateTarget(testFile, result.target_column);
|
|
16
|
-
console.log("Validation:", val.problem_type, val.valid);
|
|
17
|
-
}
|
|
18
|
-
// Test Case 2: diagnosis (Classification)
|
|
19
|
-
console.log("\n--- Test Case 2: diagnosis ---");
|
|
20
|
-
fs.writeFileSync(testFile, "id,age,diagnosis\n1,50,M\n2,60,B\n3,45,M");
|
|
21
|
-
result = await detector.detectTarget(testFile);
|
|
22
|
-
console.log("Detection:", result.target_column, result.confidence);
|
|
23
|
-
if (result.target_column) {
|
|
24
|
-
let val = await detector.validateTarget(testFile, result.target_column);
|
|
25
|
-
console.log("Validation:", val.problem_type, val.valid);
|
|
26
|
-
}
|
|
27
|
-
fs.unlinkSync(testFile);
|
|
28
|
-
}
|
|
29
|
-
testDetector().catch(console.error);
|
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
import { UCIScraper } from "../metadata/uci-scraper.js";
|
|
2
|
-
async function runTest() {
|
|
3
|
-
console.log("--- Testing UCI integration ---");
|
|
4
|
-
const scraper = new UCIScraper();
|
|
5
|
-
const query = "iris";
|
|
6
|
-
console.log(`Searching UCI for: "${query}"...`);
|
|
7
|
-
const results = await scraper.scrape(query, 5);
|
|
8
|
-
console.log(`Found ${results.length} datasets.`);
|
|
9
|
-
if (results.length > 0) {
|
|
10
|
-
console.log("✅ UCI Scraper returned results.");
|
|
11
|
-
console.log("Sample Result:");
|
|
12
|
-
console.log(JSON.stringify(results[0], null, 2));
|
|
13
|
-
// Validation
|
|
14
|
-
const sample = results[0];
|
|
15
|
-
if (sample.id.startsWith("uci:") && sample.source === "uci") {
|
|
16
|
-
console.log("✅ Metadata schema validation passed.");
|
|
17
|
-
}
|
|
18
|
-
else {
|
|
19
|
-
console.error("❌ Metadata schema validation failed.");
|
|
20
|
-
}
|
|
21
|
-
}
|
|
22
|
-
else {
|
|
23
|
-
console.error("❌ No results found. Ensure python and ucimlrepo are installed.");
|
|
24
|
-
}
|
|
25
|
-
console.log("--- Test Complete ---");
|
|
26
|
-
}
|
|
27
|
-
runTest().catch(console.error);
|
|
@@ -1,86 +0,0 @@
|
|
|
1
|
-
import { QualityOrchestrator } from "../quality/quality-orchestrator.js";
|
|
2
|
-
import path from "path";
|
|
3
|
-
import fs from "fs";
|
|
4
|
-
import { execSync } from "child_process";
|
|
5
|
-
async function runTest() {
|
|
6
|
-
console.log("--- Testing Unified Quality Report ---");
|
|
7
|
-
const projectRoot = path.resolve(".");
|
|
8
|
-
const orchestrator = new QualityOrchestrator(projectRoot);
|
|
9
|
-
// Create a mixed dataset directory
|
|
10
|
-
const testDataDir = path.join(projectRoot, "data", "test-unified");
|
|
11
|
-
if (!fs.existsSync(testDataDir))
|
|
12
|
-
fs.mkdirSync(testDataDir, { recursive: true });
|
|
13
|
-
// 1. Create a CSV file (text modality)
|
|
14
|
-
const csvPath = path.join(testDataDir, "data.csv");
|
|
15
|
-
fs.writeFileSync(csvPath, "id,name,value\\n1,Alice,10\\n2,Bob,20\\n3,Charlie,30\\n");
|
|
16
|
-
// 2. Create a test image (using Python)
|
|
17
|
-
const imagePath = path.join(testDataDir, "test_image.png");
|
|
18
|
-
const pythonScript = `
|
|
19
|
-
import numpy as np
|
|
20
|
-
from PIL import Image
|
|
21
|
-
img = Image.fromarray(np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8))
|
|
22
|
-
img.save('${imagePath.replace(/\\/g, "\\\\")}')
|
|
23
|
-
`;
|
|
24
|
-
fs.writeFileSync(path.join(testDataDir, "gen_image.py"), pythonScript);
|
|
25
|
-
try {
|
|
26
|
-
execSync(`python "${path.join(testDataDir, "gen_image.py")}"`);
|
|
27
|
-
}
|
|
28
|
-
catch (e) {
|
|
29
|
-
console.log("⚠️ Could not generate image. Skipping image modality.");
|
|
30
|
-
}
|
|
31
|
-
// 3. Create a test audio file (using Python)
|
|
32
|
-
const audioPath = path.join(testDataDir, "test_audio.wav");
|
|
33
|
-
const audioScript = `
|
|
34
|
-
import numpy as np
|
|
35
|
-
import soundfile as sf
|
|
36
|
-
sample_rate = 44100
|
|
37
|
-
duration = 1.0
|
|
38
|
-
frequency = 440.0
|
|
39
|
-
t = np.linspace(0, duration, int(sample_rate * duration))
|
|
40
|
-
audio = 0.5 * np.sin(2 * np.pi * frequency * t)
|
|
41
|
-
sf.write('${audioPath.replace(/\\/g, "\\\\")}', audio, sample_rate)
|
|
42
|
-
`;
|
|
43
|
-
fs.writeFileSync(path.join(testDataDir, "gen_audio.py"), audioScript);
|
|
44
|
-
try {
|
|
45
|
-
execSync(`python "${path.join(testDataDir, "gen_audio.py")}"`);
|
|
46
|
-
}
|
|
47
|
-
catch (e) {
|
|
48
|
-
console.log("⚠️ Could not generate audio. Skipping audio modality.");
|
|
49
|
-
}
|
|
50
|
-
// 4. Run Unified Quality Analysis
|
|
51
|
-
console.log(`\\nAnalyzing multimodal dataset at ${testDataDir}...`);
|
|
52
|
-
try {
|
|
53
|
-
const report = await orchestrator.generateReport("test-unified-dataset", testDataDir, null);
|
|
54
|
-
console.log("\\n📊 Unified Quality Report:");
|
|
55
|
-
console.log(`- Dataset ID: ${report.dataset_id}`);
|
|
56
|
-
console.log(`- Modalities: ${report.modalities.join(", ")}`);
|
|
57
|
-
console.log(`- Overall Quality Score: ${report.overall_quality_score}/100`);
|
|
58
|
-
if (report.image_quality) {
|
|
59
|
-
console.log(`\\n🖼️ Image Quality:`);
|
|
60
|
-
console.log(` - Total Images: ${report.image_quality.total_images}`);
|
|
61
|
-
console.log(` - Avg Resolution: ${report.image_quality.avg_resolution}`);
|
|
62
|
-
}
|
|
63
|
-
if (report.audio_quality) {
|
|
64
|
-
console.log(`\\n🎵 Audio Quality:`);
|
|
65
|
-
console.log(` - Total Files: ${report.audio_quality.total_files}`);
|
|
66
|
-
console.log(` - Avg Duration: ${report.audio_quality.avg_duration.toFixed(1)}s`);
|
|
67
|
-
}
|
|
68
|
-
console.log(`\\n💡 Recommendations:`);
|
|
69
|
-
report.recommendations.forEach(rec => console.log(` - ${rec}`));
|
|
70
|
-
// Verify modalities detected
|
|
71
|
-
const hasText = report.modalities.includes("text");
|
|
72
|
-
const hasImage = report.modalities.includes("image");
|
|
73
|
-
const hasAudio = report.modalities.includes("audio");
|
|
74
|
-
if (hasText && (hasImage || hasAudio) && report.overall_quality_score > 0) {
|
|
75
|
-
console.log("\\nVERIFICATION_STATUS: ✅ PASS");
|
|
76
|
-
}
|
|
77
|
-
else {
|
|
78
|
-
console.log("\\nVERIFICATION_STATUS: ⚠️ PARTIAL (Some modalities missing)");
|
|
79
|
-
}
|
|
80
|
-
}
|
|
81
|
-
catch (e) {
|
|
82
|
-
console.error(`Analysis failed: ${e.message}`);
|
|
83
|
-
console.log("\\nVERIFICATION_STATUS: ❌ FAIL");
|
|
84
|
-
}
|
|
85
|
-
}
|
|
86
|
-
runTest().catch(console.error);
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
import fs from 'fs';
|
|
2
|
-
const target = "C:\\Users\\нурбулан\\AppData\\Roaming\\Code\\User\\mcp.json";
|
|
3
|
-
const content = JSON.stringify({ mcpServers: { test: { command: "node" } } }, null, 2);
|
|
4
|
-
try {
|
|
5
|
-
console.log(`Testing write to: ${target}`);
|
|
6
|
-
fs.writeFileSync(target, content, "utf8");
|
|
7
|
-
const stat = fs.statSync(target);
|
|
8
|
-
console.log(`Success! File size: ${stat.size} bytes`);
|
|
9
|
-
const readBack = fs.readFileSync(target, "utf8");
|
|
10
|
-
console.log("Read back content:", readBack);
|
|
11
|
-
}
|
|
12
|
-
catch (e) {
|
|
13
|
-
console.error("Failed to write:", e);
|
|
14
|
-
}
|
|
@@ -1,57 +0,0 @@
|
|
|
1
|
-
import { TargetDetector } from "../preparation/target-detector.js";
|
|
2
|
-
import path from "path";
|
|
3
|
-
import fs from "fs";
|
|
4
|
-
/**
 * Smoke check for the cleaning pipeline: writes a small CSV containing a
 * `SalePrice` column, runs quality analysis, target detection and plan
 * generation on it, and checks that the resulting plan contains a
 * "RenameTarget" operation.
 *
 * The process exits with code 1 when the operation is missing or when any
 * pipeline step throws. The exit is deferred until after the `finally`
 * clause so the temporary CSV is always removed first; calling
 * `process.exit()` inside `try`/`catch` would terminate the process before
 * the cleanup ran.
 *
 * @returns {Promise<void>}
 */
async function verifyIntegration() {
    // 1. Create a dummy CSV with a clear target
    const testFile = path.join(process.cwd(), "data", "raw", "integration_test.csv");
    const testId = "integration_test";
    if (!fs.existsSync(path.dirname(testFile))) {
        fs.mkdirSync(path.dirname(testFile), { recursive: true });
    }
    console.log("Creating test file:", testFile);
    fs.writeFileSync(testFile, "id,feature1,feature2,SalePrice\n1,10,20,100000\n2,11,21,100000\n3,12,22,110000");

    // 2. Simulate the preview_cleaning logic directly, since calling the MCP
    // server from here would require a client.
    let failed = false;
    try {
        const { QualityAnalyzer } = await import("../quality/analyzer.js");
        const { CleaningPlanner } = await import("../cleaning/planner.js");
        const { CacheService, MockRedisProvider } = await import("../cache/service.js");
        console.log("Initializing services...");
        const cacheService = new CacheService(new MockRedisProvider());
        // Use build/ directory to simulate runtime environment
        const buildDir = path.join(process.cwd(), "build");
        const qualityAnalyzer = new QualityAnalyzer(cacheService, buildDir);
        const cleaningPlanner = new CleaningPlanner(cacheService, buildDir);
        console.log("Running analysis...");
        const report = await qualityAnalyzer.analyze(testFile);
        console.log("Running target detection...");
        const detector = new TargetDetector(buildDir);
        const targetResult = await detector.detectTarget(testFile);
        console.log("Detected:", targetResult);
        // Only forward target info when a target column was actually detected.
        const targetInfo = targetResult.target_column ? {
            target: targetResult.target_column,
            confidence: targetResult.confidence
        } : undefined;
        console.log("Generating plan...");
        const plan = await cleaningPlanner.generatePlan(testId, report, undefined, targetInfo);
        console.log("Plan Operations:", JSON.stringify(plan.operations, null, 2));
        const hasRename = plan.operations.some(op => op.type === "RenameTarget");
        if (hasRename) {
            console.log("✅ SUCCESS: RenameTarget operation found in plan!");
        }
        else {
            console.error("❌ FAILURE: RenameTarget operation NOT found.");
            failed = true;
        }
    }
    catch (e) {
        console.error("Error during verification:", e);
        failed = true;
    }
    finally {
        // Always remove the temporary CSV, on success and on failure alike.
        if (fs.existsSync(testFile))
            fs.unlinkSync(testFile);
    }
    // Exit only after cleanup has run.
    if (failed) {
        process.exit(1);
    }
}
|
|
57
|
-
// Script entry point: start the verification as soon as the module is loaded.
verifyIntegration();
|
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
import path from "path";
|
|
2
|
-
import { MetadataStore } from "../metadata/store.js";
|
|
3
|
-
import { VectorStore } from "../search/vector-store.js";
|
|
4
|
-
import { Embedder } from "../search/embedder.js";
|
|
5
|
-
import { SearchEngine } from "../search/engine.js";
|
|
6
|
-
import { formatSearchResults } from "../tools/formatter.js";
|
|
7
|
-
import fs from "fs";
|
|
8
|
-
// Search term comes from the first CLI argument, defaulting to "anime".
const query = process.argv[2] || "anime";

// Prefer the data directory under <home>/.vesper; when it has no metadata
// database, fall back to ./data under the current working directory.
const homeDir = process.env.HOME || process.env.USERPROFILE || process.cwd();
const vesperDataRoot = path.join(homeDir, ".vesper");
const hasUserData = fs.existsSync(path.join(vesperDataRoot, "data", "metadata.db"));
if (!hasUserData) {
    console.error("Using local project data directory as fallback...");
}
const dataDir = path.join(hasUserData ? vesperDataRoot : process.cwd(), "data");
let dbPath = path.join(dataDir, "metadata.db");
let vectorPath = path.join(dataDir, "vectors.json");

// Wire the search engine from its three collaborators.
const metadataStore = new MetadataStore(dbPath);
const vectorStore = new VectorStore(vectorPath);
const embedder = Embedder.getInstance();
const searchEngine = new SearchEngine(metadataStore, vectorStore, embedder);
|
|
23
|
-
/**
 * Runs one search for the module-level `query` (at most 5 results) and prints
 * either the formatted results or a hint when nothing was found.
 *
 * @returns {Promise<void>}
 */
async function run() {
    const banner = `\n=== VERIFYING SOURCE PRIORITIZATION [Query: "${query}"] ===\n`;
    console.log(banner);
    const hits = await searchEngine.search(query, { limit: 5 });
    // The formatted output is what shows the badges.
    const output = hits.length === 0
        ? "No results found. Run a search that triggers JIT first!"
        : formatSearchResults(hits);
    console.log(output);
}
|
|
33
|
-
// Script entry point; a rejection from run() is logged via console.error.
run().catch(console.error);
|
package/build/search/embedder.js
DELETED
|
@@ -1,34 +0,0 @@
|
|
|
1
|
-
import { pipeline, env } from "@xenova/transformers";
|
|
2
|
-
// Disable local model check to ensure it downloads if not found
|
|
3
|
-
env.allowLocalModels = false;
|
|
4
|
-
env.useBrowserCache = false;
|
|
5
|
-
/**
 * Process-wide singleton that lazily loads a feature-extraction pipeline and
 * turns text into embedding vectors.
 *
 * Loading is de-duplicated: callers that arrive while the model is still
 * loading share the same in-flight load instead of each starting their own.
 */
export class Embedder {
    static instance;
    extractor = null;
    modelName = "Xenova/paraphrase-multilingual-MiniLM-L12-v2";
    /** In-flight model load shared by concurrent callers; null when idle. */
    #pendingInit = null;
    constructor() { }
    /**
     * Returns the shared Embedder, creating it on first use.
     *
     * @returns {Embedder} The singleton instance.
     */
    static getInstance() {
        if (!Embedder.instance) {
            Embedder.instance = new Embedder();
        }
        return Embedder.instance;
    }
    /**
     * Loads the embedding model if it is not loaded yet. Safe to call
     * concurrently: only one load is started, and every caller awaits it.
     * If the load fails, the error is rethrown to all waiting callers and the
     * next call starts a fresh attempt.
     *
     * @returns {Promise<void>}
     */
    async init() {
        if (this.extractor) {
            return;
        }
        const pending = (this.#pendingInit ??= this.#loadExtractor());
        try {
            await pending;
        }
        catch (err) {
            // Forget only the attempt that failed, so a retry that has already
            // been started by another caller is not discarded.
            if (this.#pendingInit === pending) {
                this.#pendingInit = null;
            }
            throw err;
        }
    }
    /**
     * Embeds `text` using the loaded pipeline with mean pooling and
     * normalization enabled, loading the model first when necessary.
     *
     * @param {string} text - Input to embed.
     * @returns {Promise<Float32Array>} The `data` field of the pipeline output.
     */
    async embed(text) {
        if (!this.extractor) {
            await this.init();
        }
        const result = await this.extractor(text, {
            pooling: "mean",
            normalize: true,
        });
        // result.data is already a Float32Array in Xenova/transformers
        return result.data;
    }
    /** Performs the actual model load and stores the resulting extractor. */
    async #loadExtractor() {
        console.error(`Loading embedding model: ${this.modelName}...`);
        this.extractor = await pipeline("feature-extraction", this.modelName);
        console.error("Embedding model loaded successfully.");
    }
}
|