@vespermcp/mcp-server 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +259 -0
- package/build/cache/cdn.js +34 -0
- package/build/cache/service.js +63 -0
- package/build/cleaning/cleaner.js +50 -0
- package/build/cleaning/evaluator.js +89 -0
- package/build/cleaning/executor.js +60 -0
- package/build/cleaning/exporter.js +87 -0
- package/build/cleaning/planner.js +111 -0
- package/build/cleaning/rules.js +57 -0
- package/build/cleaning/types.js +1 -0
- package/build/cloud/adapters/local.js +37 -0
- package/build/cloud/adapters/s3.js +24 -0
- package/build/cloud/storage-manager.js +20 -0
- package/build/cloud/types.js +1 -0
- package/build/compliance/service.js +73 -0
- package/build/compliance/store.js +80 -0
- package/build/compliance/types.js +1 -0
- package/build/data/processing-worker.js +23 -0
- package/build/data/streaming.js +38 -0
- package/build/data/worker-pool.js +39 -0
- package/build/export/exporter.js +45 -0
- package/build/export/packager.js +100 -0
- package/build/export/types.js +1 -0
- package/build/fusion/aligner.js +56 -0
- package/build/fusion/deduplicator.js +69 -0
- package/build/fusion/harmonizer.js +39 -0
- package/build/fusion/orchestrator.js +86 -0
- package/build/fusion/types.js +1 -0
- package/build/index.js +632 -0
- package/build/ingestion/hf-downloader.js +64 -0
- package/build/ingestion/ingestor.js +96 -0
- package/build/ingestion/kaggle-downloader.js +79 -0
- package/build/install/install-service.js +41 -0
- package/build/jobs/manager.js +129 -0
- package/build/jobs/queue.js +59 -0
- package/build/jobs/types.js +1 -0
- package/build/metadata/domain.js +147 -0
- package/build/metadata/github-scraper.js +47 -0
- package/build/metadata/institutional-scrapers.js +49 -0
- package/build/metadata/kaggle-scraper.js +182 -0
- package/build/metadata/license.js +68 -0
- package/build/metadata/monitoring-service.js +107 -0
- package/build/metadata/monitoring-store.js +78 -0
- package/build/metadata/monitoring-types.js +1 -0
- package/build/metadata/quality.js +48 -0
- package/build/metadata/rate-limiter.js +128 -0
- package/build/metadata/scraper.js +353 -0
- package/build/metadata/store.js +325 -0
- package/build/metadata/types.js +1 -0
- package/build/metadata/uci-scraper.js +49 -0
- package/build/monitoring/observability.js +76 -0
- package/build/quality/analyzer.js +57 -0
- package/build/quality/image-analyzer.js +46 -0
- package/build/quality/media-analyzer.js +46 -0
- package/build/quality/quality-orchestrator.js +162 -0
- package/build/quality/types.js +1 -0
- package/build/scripts/build-index.js +54 -0
- package/build/scripts/check-db.js +73 -0
- package/build/scripts/check-jobs.js +24 -0
- package/build/scripts/check-naruto.js +17 -0
- package/build/scripts/demo-full-pipeline.js +62 -0
- package/build/scripts/demo-ui.js +58 -0
- package/build/scripts/e2e-demo.js +72 -0
- package/build/scripts/massive-scrape.js +103 -0
- package/build/scripts/ops-dashboard.js +33 -0
- package/build/scripts/scrape-metadata.js +100 -0
- package/build/scripts/search-cli.js +26 -0
- package/build/scripts/test-bias.js +45 -0
- package/build/scripts/test-caching.js +51 -0
- package/build/scripts/test-cleaning.js +76 -0
- package/build/scripts/test-cloud-storage.js +48 -0
- package/build/scripts/test-compliance.js +58 -0
- package/build/scripts/test-conversion.js +64 -0
- package/build/scripts/test-custom-rules.js +58 -0
- package/build/scripts/test-db-opt.js +63 -0
- package/build/scripts/test-export-custom.js +33 -0
- package/build/scripts/test-exporter.js +53 -0
- package/build/scripts/test-fusion.js +61 -0
- package/build/scripts/test-github.js +27 -0
- package/build/scripts/test-group-split.js +52 -0
- package/build/scripts/test-hf-download.js +29 -0
- package/build/scripts/test-holdout-manager.js +61 -0
- package/build/scripts/test-hybrid-search.js +41 -0
- package/build/scripts/test-image-analysis.js +50 -0
- package/build/scripts/test-ingestion-infra.js +39 -0
- package/build/scripts/test-install.js +40 -0
- package/build/scripts/test-institutional.js +26 -0
- package/build/scripts/test-integrity.js +41 -0
- package/build/scripts/test-jit.js +42 -0
- package/build/scripts/test-job-queue.js +62 -0
- package/build/scripts/test-kaggle-download.js +34 -0
- package/build/scripts/test-large-data.js +50 -0
- package/build/scripts/test-mcp-v5.js +73 -0
- package/build/scripts/test-media-analysis.js +61 -0
- package/build/scripts/test-monitoring.js +91 -0
- package/build/scripts/test-observability.js +106 -0
- package/build/scripts/test-packager.js +55 -0
- package/build/scripts/test-pipeline.js +50 -0
- package/build/scripts/test-planning.js +64 -0
- package/build/scripts/test-privacy.js +38 -0
- package/build/scripts/test-quality.js +43 -0
- package/build/scripts/test-robust-ingestion.js +41 -0
- package/build/scripts/test-schema.js +45 -0
- package/build/scripts/test-split-validation.js +40 -0
- package/build/scripts/test-splitter.js +93 -0
- package/build/scripts/test-uci.js +27 -0
- package/build/scripts/test-unified-quality.js +86 -0
- package/build/search/embedder.js +34 -0
- package/build/search/engine.js +129 -0
- package/build/search/jit-orchestrator.js +232 -0
- package/build/search/vector-store.js +105 -0
- package/build/splitting/splitter.js +57 -0
- package/build/splitting/types.js +1 -0
- package/build/tools/formatter.js +227 -0
- package/build/utils/downloader.js +52 -0
- package/mcp-config-template.json +15 -0
- package/package.json +84 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/cleaner.py +196 -0
- package/src/python/export_engine.py +112 -0
- package/src/python/framework_adapters.py +100 -0
- package/src/python/github_adapter.py +106 -0
- package/src/python/image_engine.py +86 -0
- package/src/python/media_engine.py +133 -0
- package/src/python/nasa_adapter.py +82 -0
- package/src/python/quality_engine.py +243 -0
- package/src/python/splitter_engine.py +283 -0
- package/src/python/test_framework_adapters.py +61 -0
- package/src/python/uci_adapter.py +94 -0
- package/src/python/worldbank_adapter.py +99 -0
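
Note on wiring: the listing includes package/mcp-config-template.json and a build/index.js entry point. The template's exact contents are not shown in this diff, but an MCP client registration for a server like this typically takes the following shape; the server name, entry path, and env keys below are assumptions (the env vars do appear in the scripts further down):

    {
      "mcpServers": {
        "vesper": {
          "command": "node",
          "args": ["node_modules/@vespermcp/mcp-server/build/index.js"],
          "env": {
            "HF_TOKEN": "<optional>",
            "KAGGLE_USERNAME": "<optional>",
            "KAGGLE_KEY": "<optional>"
          }
        }
      }
    }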

package/build/scripts/demo-ui.js
@@ -0,0 +1,58 @@
+import { SearchEngine } from "../search/engine.js";
+import { MetadataStore } from "../metadata/store.js";
+import { VectorStore } from "../search/vector-store.js";
+import { Embedder } from "../search/embedder.js";
+import { formatSearchResults, formatDatasetInfo } from "../tools/formatter.js";
+import path from "path";
+/**
+ * Demo script to showcase the new formatted UI
+ */
+async function main() {
+    const dbPath = path.join(process.cwd(), "data", "metadata.db");
+    const vectorPath = path.join(process.cwd(), "data", "vectors.json");
+    const store = new MetadataStore(dbPath);
+    const vectorStore = new VectorStore(vectorPath);
+    const embedder = Embedder.getInstance();
+    await embedder.init();
+    const engine = new SearchEngine(store, vectorStore, embedder);
+    console.log("\n" + "═".repeat(80));
+    console.log("VESPER UI DEMO - Formatted Search Results");
+    console.log("═".repeat(80) + "\n");
+    // Demo 1: Medical datasets
+    console.log("Demo 1: Medical Dataset Search\n");
+    const medicalResults = await engine.search("diabetes prediction machine learning", {
+        limit: 3,
+        safeOnly: true
+    });
+    console.log(formatSearchResults(medicalResults));
+    // Demo 2: Detailed dataset info
+    if (medicalResults.length > 0) {
+        console.log("\n" + "═".repeat(80));
+        console.log("Demo 2: Detailed Dataset Information");
+        console.log("═".repeat(80) + "\n");
+        console.log(formatDatasetInfo(medicalResults[0]));
+    }
+    // Demo 3: Computer Vision
+    console.log("\n" + "═".repeat(80));
+    console.log("Demo 3: Computer Vision Dataset Search");
+    console.log("═".repeat(80) + "\n");
+    const cvResults = await engine.search("image classification cats dogs", {
+        limit: 3,
+        safeOnly: true
+    });
+    console.log(formatSearchResults(cvResults));
+    // Demo 4: Show quality warnings
+    console.log("\n" + "═".repeat(80));
+    console.log("Demo 4: Niche Query with Quality Warnings");
+    console.log("═".repeat(80) + "\n");
+    const nicheResults = await engine.search("ancient manuscript text recognition", {
+        limit: 3,
+        safeOnly: true
+    });
+    console.log(formatSearchResults(nicheResults));
+    console.log("\n" + "═".repeat(80));
+    console.log("Demo Complete!");
+    console.log("═".repeat(80) + "\n");
+    store.close();
+}
+main().catch(console.error);

package/build/scripts/e2e-demo.js
@@ -0,0 +1,72 @@
+import { QualityAnalyzer } from "../quality/analyzer.js";
+import { PipelineExecutor } from "../cleaning/executor.js";
+import { DataSplitter } from "../splitting/splitter.js";
+import { MetadataPackager } from "../export/packager.js";
+import { DataExporter } from "../export/exporter.js";
+import fs from "fs";
+import path from "path";
+async function main() {
+    console.log(" Vesper Data Ops Engine - End-to-End Pipeline Demo\n");
+    const sessionDir = path.join(process.cwd(), "e2e_demo_output");
+    if (fs.existsSync(sessionDir))
+        fs.rmSync(sessionDir, { recursive: true, force: true });
+    fs.mkdirSync(sessionDir);
+    const rawFile = path.join(sessionDir, "raw_data.csv");
+    // --- STEP 0: Create Raw "Dirty" Data ---
+    console.log(" Step 0: Initializing Raw Dataset...");
+    let content = "id,name,age,salary,joined_date\n";
+    content += "1,Alice,25,50000,2023-01-01\n";
+    content += "2,Bob,,60000,2023-01-05\n"; // Missing age
+    content += "3,Charlie,35,70000,2023-01-10\n";
+    content += "1,Alice,25,50000,2023-01-01\n"; // Duplicate
+    content += "4,Diana,40,invalid,2023-02-01\n"; // Type mismatch
+    fs.writeFileSync(rawFile, content);
+    // --- STEP 1: Quality Analysis ---
+    console.log(" Step 1: Running Quality Analysis...");
+    const analyzer = new QualityAnalyzer();
+    const report = await analyzer.analyze(rawFile);
+    console.log(` - Quality Score: ${report.overall_score}/100`);
+    console.log(` - Warnings Found: ${report.warnings.length}`);
+    // --- STEP 2: Auto-Cleaning ---
+    console.log(" Step 2: Generating and Executing Cleaning Pipeline...");
+    const executor = new PipelineExecutor();
+    const pipelineResult = await executor.runPipeline("demo-dataset", rawFile);
+    const cleanedFile = pipelineResult.final_output_path;
+    console.log(` - Cleaned file: ${path.basename(cleanedFile)}`);
+    // --- STEP 3: Smart Splitting ---
+    console.log(" Step 3: Splitting into Train/Val/Test/Holdout (Stratified)...");
+    const splitter = new DataSplitter();
+    const splitResult = await splitter.split(cleanedFile, {
+        type: "random", // Using random since dummy data too small for stratified
+        ratios: { train: 0.6, val: 0.2, test: 0.1, holdout: 0.1 },
+        shuffle: true,
+        random_seed: 42
+    });
+    console.log(` - Splits created: ${Object.keys(splitResult.paths).join(", ")}`);
+    // --- STEP 4: Export to Parquet ---
+    console.log(" Step 4: Exporting Final Train Set to Parquet...");
+    const exporter = new DataExporter();
+    const parquetFile = path.join(sessionDir, "train_final.parquet");
+    await exporter.export(splitResult.paths.train, parquetFile, "parquet");
+    console.log(` - Exported: ${path.basename(parquetFile)}`);
+    // --- STEP 5: Metadata Packaging ---
+    console.log(" Step 5: Creating Final Data Package...");
+    const packager = new MetadataPackager();
+    const packageResult = await packager.createPackage(path.join(sessionDir, "vesper_package"), [
+        { path: parquetFile, format: "parquet", name: "train-set" },
+        { path: splitResult.paths.holdout, format: "csv", name: "holdout-set" }
+    ], {
+        name: "Vesper E2E Demo",
+        version: "1.0.0",
+        description: "Automatically cleaned and partitioned dataset.",
+        license: "MIT",
+        author: "Vesper Engine"
+    }, { qualityReport: report });
+    console.log(" Pipeline Finished Successfully!");
+    console.log(` Package Location: ${packageResult.packagePath}`);
+    console.log(` Manifest: datapackage.json created.`);
+}
+main().catch(err => {
+    console.error(" Pipeline Failed:", err);
+    process.exit(1);
+});
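
The demo's final step reports that a datapackage.json manifest is written into the package directory. A minimal sketch for inspecting it after a run; the field names (name, resources) follow the usual datapackage.json conventions and are an assumption, since the packager's schema is not shown in this diff:

    import fs from "fs";
    import path from "path";

    // Path taken from the demo above; `name` and `resources` are assumed fields.
    const manifestPath = path.join(process.cwd(), "e2e_demo_output", "vesper_package", "datapackage.json");
    const manifest = JSON.parse(fs.readFileSync(manifestPath, "utf8"));
    console.log(`Package: ${manifest.name}, resources: ${(manifest.resources ?? []).length}`);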

package/build/scripts/massive-scrape.js
@@ -0,0 +1,103 @@
+import { HuggingFaceScraper } from "../metadata/scraper.js";
+import { KaggleMetadataScraper } from "../metadata/kaggle-scraper.js";
+import { MetadataStore } from "../metadata/store.js";
+import path from "path";
+/**
+ * Realistic massive scraper: Get maximum from HF + extensive Kaggle coverage
+ * Phase 1: Bulk Discovery (Skeleton Indexing)
+ * Phase 2: Kaggle Discovery
+ */
+async function main() {
+    const scraper = new HuggingFaceScraper();
+    const dbPath = path.join(process.cwd(), "data", "metadata.db");
+    const store = new MetadataStore(dbPath);
+    const allDatasets = new Map();
+    try {
+        // Load existing datasets to avoid duplicates
+        console.error(`\nLoading existing datasets from database...`);
+        const existing = store.getAllDatasets();
+        for (const ds of existing) {
+            allDatasets.set(ds.id, ds);
+        }
+        console.error(`Found ${existing.length} existing datasets in database`);
+        // Phase 1: Bulk discovery from HuggingFace
+        console.error(`\nPhase 1: Bulk HuggingFace Discovery (Target: 30,000)`);
+        const hfToken = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
+        if (hfToken) {
+            console.error(`Using HuggingFace token (rate limits should be higher)`);
+        }
+        else {
+            console.error(`WARNING: No HF_TOKEN found. Bulk scraping may be slower.`);
+        }
+        const hfLimit = 30000;
+        const hfDatasets = await scraper.scrapeBulk(hfLimit);
+        let newHfCount = 0;
+        for (const ds of hfDatasets) {
+            if (!allDatasets.has(ds.id)) {
+                allDatasets.set(ds.id, ds);
+                newHfCount++;
+            }
+        }
+        console.error(`HuggingFace Bulk Discovery: ${newHfCount} new datasets (${allDatasets.size} total unique)`);
+        // Save HF progress
+        console.error(`Saving HF discovery results to database...`);
+        store.beginTransaction();
+        try {
+            for (const ds of hfDatasets) {
+                store.saveDataset(ds);
+            }
+            store.commit();
+        }
+        catch (e) {
+            store.rollback();
+            console.error("Failed to save HF discovery progress:", e);
+        }
+        // Phase 2: Extensive Kaggle scraping across many categories
+        const kaggleUser = process.env.KAGGLE_USERNAME;
+        const kaggleKey = process.env.KAGGLE_KEY;
+        if (kaggleUser && kaggleKey) {
+            console.error(`\nPhase 2: Extensive Kaggle scraping`);
+            const kaggleScraper = new KaggleMetadataScraper(kaggleUser, kaggleKey);
+            // Comprehensive Kaggle search terms
+            const kaggleSearches = [
+                "machine learning", "deep learning", "data science",
+                "classification", "regression", "clustering", "anomaly detection",
+                "natural language processing", "text classification", "sentiment analysis",
+                "image classification", "object detection", "medical imaging",
+                "time series", "forecasting", "financial forecasting",
+                "healthcare", "medical diagnosis", "e-commerce", "social media"
+            ];
+            for (const search of kaggleSearches) {
+                console.error(` [Kaggle: "${search}"] Fetching...`);
+                try {
+                    const kaggleDatasets = await kaggleScraper.scrape(search, 100, true);
+                    let newKaggleCount = 0;
+                    for (const ds of kaggleDatasets) {
+                        ds.id = `kaggle:${ds.id}`;
+                        if (!allDatasets.has(ds.id)) {
+                            allDatasets.set(ds.id, ds);
+                            newKaggleCount++;
+                            store.saveDataset(ds); // Save individually to avoid long transactions
+                        }
+                    }
+                    console.error(` ${newKaggleCount} new datasets (${kaggleDatasets.length} total fetched, ${allDatasets.size} total unique)`);
+                }
+                catch (e) {
+                    console.error(` ERROR: ${e.message}`);
+                }
+                await new Promise(resolve => setTimeout(resolve, 2000));
+            }
+        }
+        console.error(`\nBulk Discovery Complete!`);
+        console.error(`Total unique datasets in library: ${allDatasets.size}`);
+        console.error(`\nNext step: Run 'npm run index' to update vectors.`);
+    }
+    catch (error) {
+        console.error("\nERROR: Massive scraping failed:", error);
+        process.exit(1);
+    }
+    finally {
+        store.close();
+    }
+}
+main();
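
Judging from the environment variables the script reads and its closing message, a run would presumably look like this (the build path is assumed from the file listing above):

    HF_TOKEN=... KAGGLE_USERNAME=... KAGGLE_KEY=... node build/scripts/massive-scrape.js
    npm run index   # re-embed afterwards, per the script's final message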

package/build/scripts/ops-dashboard.js
@@ -0,0 +1,33 @@
+import { MetadataStore } from "../metadata/store.js";
+import { JobManager } from "../jobs/manager.js";
+import path from "path";
+async function showDashboard() {
+    const dbPath = path.resolve("data", "vesper.db");
+    const store = new MetadataStore(dbPath);
+    // In a real app, this would be a shared instance in a long-running process
+    const manager = JobManager.getInstance(store);
+    const stats = manager.stats;
+    console.clear();
+    console.log("==========================================");
+    console.log(" VESPER OPERATIONS DASHBOARD ");
+    console.log("==========================================\n");
+    const prometheus = stats.getPrometheusMetrics();
+    const summary = stats.getStats();
+    if (Object.keys(summary).length === 0) {
+        console.log("No metrics recorded yet. Process some jobs to see data.");
+    }
+    else {
+        console.log("--- SYSTEM METRICS (JSON) ---");
+        console.log(JSON.stringify(summary, null, 2));
+        console.log("\n--- PROMETHEUS EXPORT (RAW) ---");
+        console.log(prometheus);
+    }
+    console.log("\n==========================================");
+    console.log(" Press Ctrl+C to exit dashboard loop ");
+    console.log("==========================================");
+    store.close();
+}
+// Simple loop for "real-time" feel (simulated)
+console.log("Starting Dashboard...");
+showDashboard();
+setInterval(showDashboard, 5000);
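
For context, getPrometheusMetrics() presumably emits the standard Prometheus text exposition format, along these lines (the metric names here are illustrative only, not taken from the package):

    # HELP vesper_jobs_completed_total Total jobs completed
    # TYPE vesper_jobs_completed_total counter
    vesper_jobs_completed_total 42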

package/build/scripts/scrape-metadata.js
@@ -0,0 +1,100 @@
+import { HuggingFaceScraper } from "../metadata/scraper.js";
+import { KaggleMetadataScraper } from "../metadata/kaggle-scraper.js";
+import { MetadataStore } from "../metadata/store.js";
+import path from "path";
+async function main() {
+    const scraper = new HuggingFaceScraper();
+    const dbPath = path.join(process.cwd(), "data", "metadata.db");
+    const store = new MetadataStore(dbPath);
+    // Get limit from command line args or default to 100
+    let limit = 100;
+    if (process.argv[2] && !process.argv[2].startsWith("-")) {
+        limit = parseInt(process.argv[2], 10);
+        if (isNaN(limit))
+            limit = 100;
+    }
+    // Check if MVP filters should be disabled (--no-mvp flag)
+    const applyMVPFilters = !process.argv.includes("--no-mvp");
+    // Get domain filter if specified (--domain medicine, --domain healthcare, etc.)
+    const domainArgIndex = process.argv.indexOf("--domain");
+    const domainFilter = domainArgIndex !== -1 && process.argv[domainArgIndex + 1]
+        ? process.argv[domainArgIndex + 1]
+        : undefined;
+    try {
+        const datasets = [];
+        const domainMsg = domainFilter ? `, domain: ${domainFilter}` : "";
+        // 1. HuggingFace
+        const hfLimit = limit > 1000 ? limit : 1000; // Aim for at least 1k for "massive"
+        console.error(`Scraping HF: ${hfLimit} datasets with MVP filters: ${applyMVPFilters}${domainMsg}`);
+        const hfDatasets = await scraper.scrape(hfLimit, applyMVPFilters, domainFilter);
+        datasets.push(...hfDatasets);
+        // 2. Kaggle (Optional - only if credentials provided)
+        const kaggleUser = process.env.KAGGLE_USERNAME;
+        const kaggleKey = process.env.KAGGLE_KEY;
+        if (kaggleUser && kaggleKey) {
+            const kaggleLimit = Math.max(100, Math.floor(hfLimit / 4)); // Balance HF/Kaggle ratio
+            console.error(`\nScraping Kaggle: searching for "${domainFilter || 'all'}" (limit: ${kaggleLimit})`);
+            const kaggleScraper = new KaggleMetadataScraper(kaggleUser, kaggleKey);
+            // For massive scraping, we might want to iterate through multiple common search terms if no domainFilter is set
+            const searchTerms = domainFilter ? [domainFilter] : ["machine learning", "data science", "nlp", "computer vision", "healthcare"];
+            const termLimit = Math.floor(kaggleLimit / searchTerms.length);
+            for (const term of searchTerms) {
+                console.error(`[Kaggle] Fetching "${term}"...`);
+                const kaggleDatasets = await kaggleScraper.scrape(term, termLimit);
+                kaggleDatasets.forEach(d => {
+                    d.id = `kaggle:${d.id}`;
+                    if (!datasets.some(existing => existing.id === d.id)) {
+                        datasets.push(d);
+                    }
+                });
+            }
+        }
+        else {
+            console.error("\n[Kaggle] Skipping (KAGGLE_USERNAME/KAGGLE_KEY not set).");
+        }
+        console.error(`\nTotal Scraped: ${datasets.length} datasets.`);
+        let saved = 0;
+        store.beginTransaction(); // Performance optimization for large batch
+        try {
+            for (const ds of datasets) {
+                store.saveDataset(ds);
+                saved++;
+                if (saved % 500 === 0)
+                    console.error(`[DB] Saved ${saved} records...`);
+            }
+            store.commit();
+        }
+        catch (e) {
+            store.rollback();
+            throw e;
+        }
+        console.error(`Successfully saved ${saved} datasets to MetadataStore.`);
+        console.error(`\nSummary:`);
+        console.error(` - Total examples: ${datasets.reduce((sum, d) => sum + d.total_examples, 0).toLocaleString()}`);
+        console.error(` - Safe sources: ${datasets.filter(d => d.is_safe_source).length}`);
+        console.error(` - Structured datasets: ${datasets.filter(d => d.is_structured).length}`);
+        console.error(` - Permissive licenses: ${datasets.filter(d => d.license.category === "safe").length}`);
+        // Show domain distribution
+        const domainCounts = new Map();
+        datasets.forEach(d => {
+            const domain = d.domain || "unknown";
+            domainCounts.set(domain, (domainCounts.get(domain) || 0) + 1);
+        });
+        if (domainCounts.size > 0) {
+            console.error(` - Domains:`);
+            Array.from(domainCounts.entries())
+                .sort((a, b) => b[1] - a[1])
+                .forEach(([domain, count]) => {
+                    console.error(`   ${domain}: ${count}`);
+                });
+        }
+    }
+    catch (error) {
+        console.error("Scraping failed:", error);
+        process.exit(1);
+    }
+    finally {
+        store.close();
+    }
+}
+main();
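
Given the argument handling above (positional limit, --no-mvp, --domain), invocations would presumably look like this (the build path is assumed from the file listing):

    node build/scripts/scrape-metadata.js 2000 --domain healthcare
    node build/scripts/scrape-metadata.js 500 --no-mvp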

package/build/scripts/search-cli.js
@@ -0,0 +1,26 @@
+import path from "path";
+import { MetadataStore } from "../metadata/store.js";
+import { VectorStore } from "../search/vector-store.js";
+import { Embedder } from "../search/embedder.js";
+import { SearchEngine } from "../search/engine.js";
+const query = process.argv[2];
+if (!query) {
+    console.error("Usage: npx tsx src/scripts/search-cli.ts \"your query\"");
+    process.exit(1);
+}
+const dbPath = path.join(process.cwd(), "data", "metadata.db");
+const vectorPath = path.join(process.cwd(), "data", "vectors.json");
+const metadataStore = new MetadataStore(dbPath);
+const vectorStore = new VectorStore(vectorPath);
+const embedder = Embedder.getInstance();
+const searchEngine = new SearchEngine(metadataStore, vectorStore, embedder);
+async function run() {
+    console.error(`Searching for: "${query}"...`);
+    const results = await searchEngine.search(query, { limit: 5, safeOnly: true });
+    if (results.length === 0) {
+        console.log("No results found.");
+        return;
+    }
+    console.log(JSON.stringify(results, null, 2));
+}
+run().catch(console.error);

package/build/scripts/test-bias.js
@@ -0,0 +1,45 @@
+import { QualityAnalyzer } from "../quality/analyzer.js";
+import fs from "fs";
+import path from "path";
+async function main() {
+    console.log("=== Vesper Bias Analysis Test ===\n");
+    const analyzer = new QualityAnalyzer();
+    const testFile = path.join(process.cwd(), "test_bias.csv");
+    // Create a CSV with severe class imbalance
+    // 'label': 95% 'A', 5% 'B' -> should trigger a warning
+    // 'gender': equally skewed in this small sample -> not expected to warn
+    let csvContent = "id,gender,label\n";
+    for (let i = 0; i < 95; i++)
+        csvContent += `${i},M,A\n`;
+    for (let i = 95; i < 100; i++)
+        csvContent += `${i},F,B\n`;
+    fs.writeFileSync(testFile, csvContent);
+    console.log(`Created test file: ${testFile}`);
+    console.log("Expectations:");
+    console.log(" - Should warn about 'Severe imbalance' in 'label' column (95% A)");
+    console.log(" - Should NOT warn about 'gender' (skewed but just a test sample)");
+    try {
+        console.log("\nRunning analyzer...");
+        const report = await analyzer.analyze(testFile);
+        console.log("\n=== Bias Report ===");
+        if (report.class_imbalance_warnings && report.class_imbalance_warnings.length > 0) {
+            report.class_imbalance_warnings.forEach(w => console.log(`[!] ${w}`));
+        }
+        else {
+            console.log("No bias detected.");
+        }
+        const labelWarning = report.class_imbalance_warnings?.find(w => w.includes("'label'"));
+        if (labelWarning) {
+            console.log("\nTest passed! Detected imbalance.");
+        }
+        else {
+            console.error("\nTest failed! Did not detect imbalance.");
+        }
+        // Cleanup
+        fs.unlinkSync(testFile);
+    }
+    catch (error) {
+        console.error("\nTest failed:", error);
+    }
+}
+main().catch(console.error);
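
The analyzer's implementation is not shown in this diff, but majority-class checks of the kind this test expects commonly reduce to a threshold on the dominant class share. A self-contained sketch of that idea (the threshold and message wording are assumptions, not the package's logic):

    // Illustrative only: warn when one class dominates a column.
    function imbalanceWarning(column, values, threshold = 0.9) {
        const counts = new Map();
        for (const v of values) counts.set(v, (counts.get(v) ?? 0) + 1);
        const share = Math.max(...counts.values()) / values.length;
        return share >= threshold
            ? `Severe imbalance in '${column}': ${(share * 100).toFixed(0)}% in one class`
            : null;
    }
    // 95% "A" -> triggers the warning, matching the test's expectation.
    console.log(imbalanceWarning("label", Array(95).fill("A").concat(Array(5).fill("B"))));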

package/build/scripts/test-caching.js
@@ -0,0 +1,51 @@
+import { MockRedisProvider, CacheService } from "../cache/service.js";
+import { CDNService } from "../cache/cdn.js";
+import { CleaningPlanner } from "../cleaning/planner.js";
+async function runTest() {
+    console.log("--- Initializing Caching Layer Test ---");
+    const cacheProvider = new MockRedisProvider();
+    const cache = new CacheService(cacheProvider);
+    const planner = new CleaningPlanner(cache);
+    const cdn = new CDNService("data/cdn_mock", "https://cdn.vesper.ai");
+    const dsId = "test/cache-dataset";
+    const mockReport = {
+        row_count: 1000,
+        column_count: 2,
+        duplicate_rows: 5,
+        duplicate_percentage: 0.5,
+        columns: [
+            { name: "col1", type: "Utf8", inferred_type: "String", missing_count: 0, missing_percentage: 0, unique_count: 1000, is_constant: false, is_mixed_type: false }
+        ],
+        warnings: [],
+        schema_warnings: [],
+        overall_score: 80
+    };
+    // 1. Test Planning Cache
+    console.log("\n--- Testing Cleaning Plan Cache ---");
+    console.log("First Run (Cache Miss)...");
+    const start1 = Date.now();
+    await planner.generatePlan(dsId, mockReport);
+    console.log(`First run took ${Date.now() - start1}ms`);
+    console.log("Second Run (Cache Hit)...");
+    const start2 = Date.now();
+    await planner.generatePlan(dsId, mockReport);
+    console.log(`Second run took ${Date.now() - start2}ms (Expected to be near 0ms)`);
+    // 2. Test TTL / Hash mismatch
+    console.log("\n--- Testing Cache Sensitivity (Config Change) ---");
+    console.log("Running with a ruleSet (New Cache Key)...");
+    const start3 = Date.now();
+    await planner.generatePlan(dsId, mockReport, { id: "rules-1", name: "Custom", rules: [] });
+    console.log(`Config change run took ${Date.now() - start3}ms (Cache Miss)`);
+    // 3. Test CDN Upload
+    console.log("\n--- Testing CDN Mock ---");
+    const reportContent = JSON.stringify(mockReport, null, 2);
+    const url = await cdn.upload("report_latest.json", reportContent);
+    console.log(`Report uploaded to CDN: ${url}`);
+    if (url.startsWith("https://cdn.vesper.ai")) {
+        console.log("\n✅ Success: Caching and CDN layers verified.");
+    }
+    else {
+        console.error("\n❌ Failure: CDN URL generation mismatch.");
+    }
+}
+runTest().catch(console.error);
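
What the test exercises is a read-through cache keyed on the plan's inputs: the first generatePlan call computes and stores the plan, the second returns the stored value, and supplying a ruleSet changes the cache key. A generic sketch of that pattern (not CacheService's actual implementation, which this diff does not show):

    // Generic read-through cache, for illustration only (Node ESM, top-level await).
    const memory = new Map();
    async function cached(key, compute) {
        if (memory.has(key)) return memory.get(key); // hit: skip the work
        const value = await compute();               // miss: compute once, then store
        memory.set(key, value);
        return value;
    }
    // The key must cover everything the result depends on (dataset id, report, ruleSet).
    const key = JSON.stringify(["test/cache-dataset", { ruleSet: "rules-1" }]);
    const plan = await cached(key, async () => ({ operations: [] }));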

package/build/scripts/test-cleaning.js
@@ -0,0 +1,76 @@
+import { DataCleaner } from "../cleaning/cleaner.js";
+import fs from "fs";
+import path from "path";
+async function main() {
+    console.log("=== Vesper Auto-Cleaning Pipeline Test ===\n");
+    const cleaner = new DataCleaner();
+    const testFile = path.join(process.cwd(), "test_cleaning.csv");
+    // Create Dirty Data
+    // - id: Duplicate rows (1)
+    // - age: Missing values, Outlier (200), Wrong Type ("30" as a string)
+    // - score: Good float
+    // - unnecessary: Column to drop
+    const csvContent = `id,age,score,unnecessary
+1,25,88.5,trash
+2,,92.0,trash
+3,200,15.0,trash
+1,25,88.5,trash
+4,"30",80.0,trash`;
+    fs.writeFileSync(testFile, csvContent);
+    console.log(`Created dirty file: ${testFile}`);
+    // Define Cleaning Plan
+    const ops = [
+        {
+            type: "DropColumns",
+            params: { columns: ["unnecessary"] },
+            reason: "Not useful"
+        },
+        {
+            type: "RemoveDuplicates",
+            params: {},
+            reason: "Duplicate rows"
+        },
+        {
+            type: "FillMissing",
+            params: { column: "age", method: "constant", value: 0 },
+            reason: "Impute missing age"
+        },
+        // Note: Polars can't easily fix "thirty" to 30 automatically without specific logic,
+        // so we just cast what we can. "30" string -> 30 int works.
+        {
+            type: "FixTypes",
+            params: { column: "age", type: "float" },
+            reason: "Convert age to number"
+        },
+        {
+            type: "RemoveOutliers",
+            params: { column: "age", method: "iqr", threshold: 1.5 },
+            reason: "Remove age > 100"
+        }
+    ];
+    try {
+        console.log("Executing cleaning plan...");
+        const result = await cleaner.clean(testFile, ops);
+        console.log("\n=== Cleaning Result ===");
+        console.log(`Success: ${result.success}`);
+        console.log(`Output: ${result.output_path || "None"}`);
+        console.log(`Rows Affected: ${result.rows_affected}`);
+        console.log("\nLogs:");
+        result.logs.forEach(l => console.log(` - ${l}`));
+        // Validate Output File Exists
+        if (fs.existsSync(result.output_path)) {
+            console.log("\nConverted file created successfully.");
+            // Cleanup
+            fs.unlinkSync(testFile);
+            fs.unlinkSync(result.output_path);
+            console.log("Test passed!");
+        }
+        else {
+            console.error("\nTest failed! Output file missing.");
+        }
+    }
+    catch (error) {
+        console.error("\nTest failed:", error);
+    }
+}
+main().catch(console.error);

package/build/scripts/test-cloud-storage.js
@@ -0,0 +1,48 @@
+import { StorageManager } from "../cloud/storage-manager.js";
+import fs from "fs";
+import path from "path";
+async function main() {
+    console.log("=== Vesper Cloud Storage Test ===\n");
+    const vaultDir = path.join(process.cwd(), "test_vault");
+    const adapter = StorageManager.createAdapter({
+        type: "local",
+        options: { basePath: vaultDir }
+    });
+    const testFile = "test_upload.txt";
+    fs.writeFileSync(testFile, "Hello Cloud Storage!");
+    try {
+        console.log("Testing upload...");
+        const remotePath = "datasets/v1/test.txt";
+        const url = await adapter.upload(testFile, remotePath);
+        console.log("Upload URL:", url);
+        const expectedFile = path.join(vaultDir, remotePath);
+        if (fs.existsSync(expectedFile)) {
+            console.log("PASS: File successfully uploaded to local vault.");
+            const content = fs.readFileSync(expectedFile, "utf8");
+            if (content === "Hello Cloud Storage!") {
+                console.log("PASS: Content integrity verified.");
+            }
+        }
+        else {
+            console.error("FAIL: Uploaded file not found in vault.");
+        }
+        console.log("\nTesting signed URL...");
+        const signedUrl = await adapter.getSignedUrl(remotePath);
+        console.log("Signed URL:", signedUrl);
+        console.log("\nTesting deletion...");
+        await adapter.delete(remotePath);
+        if (!fs.existsSync(expectedFile)) {
+            console.log("PASS: File successfully deleted from vault.");
+        }
+    }
+    catch (e) {
+        console.error("FAIL: Cloud storage test failed:", e);
+    }
+    finally {
+        if (fs.existsSync(testFile))
+            fs.unlinkSync(testFile);
+        if (fs.existsSync(vaultDir))
+            fs.rmSync(vaultDir, { recursive: true, force: true });
+    }
+}
+main().catch(console.error);
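
The file listing also includes build/cloud/adapters/s3.js; by analogy with the local adapter above, selecting it would presumably be a matter of the config passed to createAdapter. The option names below are placeholders, since the s3 adapter's options are not shown in this diff:

    // Hypothetical: option names are not visible in this diff.
    const s3 = StorageManager.createAdapter({
        type: "s3",
        options: { /* bucket, region, credentials */ }
    });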