@vespermcp/mcp-server 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +259 -0
- package/build/cache/cdn.js +34 -0
- package/build/cache/service.js +63 -0
- package/build/cleaning/cleaner.js +50 -0
- package/build/cleaning/evaluator.js +89 -0
- package/build/cleaning/executor.js +60 -0
- package/build/cleaning/exporter.js +87 -0
- package/build/cleaning/planner.js +111 -0
- package/build/cleaning/rules.js +57 -0
- package/build/cleaning/types.js +1 -0
- package/build/cloud/adapters/local.js +37 -0
- package/build/cloud/adapters/s3.js +24 -0
- package/build/cloud/storage-manager.js +20 -0
- package/build/cloud/types.js +1 -0
- package/build/compliance/service.js +73 -0
- package/build/compliance/store.js +80 -0
- package/build/compliance/types.js +1 -0
- package/build/data/processing-worker.js +23 -0
- package/build/data/streaming.js +38 -0
- package/build/data/worker-pool.js +39 -0
- package/build/export/exporter.js +45 -0
- package/build/export/packager.js +100 -0
- package/build/export/types.js +1 -0
- package/build/fusion/aligner.js +56 -0
- package/build/fusion/deduplicator.js +69 -0
- package/build/fusion/harmonizer.js +39 -0
- package/build/fusion/orchestrator.js +86 -0
- package/build/fusion/types.js +1 -0
- package/build/index.js +632 -0
- package/build/ingestion/hf-downloader.js +64 -0
- package/build/ingestion/ingestor.js +96 -0
- package/build/ingestion/kaggle-downloader.js +79 -0
- package/build/install/install-service.js +41 -0
- package/build/jobs/manager.js +129 -0
- package/build/jobs/queue.js +59 -0
- package/build/jobs/types.js +1 -0
- package/build/metadata/domain.js +147 -0
- package/build/metadata/github-scraper.js +47 -0
- package/build/metadata/institutional-scrapers.js +49 -0
- package/build/metadata/kaggle-scraper.js +182 -0
- package/build/metadata/license.js +68 -0
- package/build/metadata/monitoring-service.js +107 -0
- package/build/metadata/monitoring-store.js +78 -0
- package/build/metadata/monitoring-types.js +1 -0
- package/build/metadata/quality.js +48 -0
- package/build/metadata/rate-limiter.js +128 -0
- package/build/metadata/scraper.js +353 -0
- package/build/metadata/store.js +325 -0
- package/build/metadata/types.js +1 -0
- package/build/metadata/uci-scraper.js +49 -0
- package/build/monitoring/observability.js +76 -0
- package/build/quality/analyzer.js +57 -0
- package/build/quality/image-analyzer.js +46 -0
- package/build/quality/media-analyzer.js +46 -0
- package/build/quality/quality-orchestrator.js +162 -0
- package/build/quality/types.js +1 -0
- package/build/scripts/build-index.js +54 -0
- package/build/scripts/check-db.js +73 -0
- package/build/scripts/check-jobs.js +24 -0
- package/build/scripts/check-naruto.js +17 -0
- package/build/scripts/demo-full-pipeline.js +62 -0
- package/build/scripts/demo-ui.js +58 -0
- package/build/scripts/e2e-demo.js +72 -0
- package/build/scripts/massive-scrape.js +103 -0
- package/build/scripts/ops-dashboard.js +33 -0
- package/build/scripts/scrape-metadata.js +100 -0
- package/build/scripts/search-cli.js +26 -0
- package/build/scripts/test-bias.js +45 -0
- package/build/scripts/test-caching.js +51 -0
- package/build/scripts/test-cleaning.js +76 -0
- package/build/scripts/test-cloud-storage.js +48 -0
- package/build/scripts/test-compliance.js +58 -0
- package/build/scripts/test-conversion.js +64 -0
- package/build/scripts/test-custom-rules.js +58 -0
- package/build/scripts/test-db-opt.js +63 -0
- package/build/scripts/test-export-custom.js +33 -0
- package/build/scripts/test-exporter.js +53 -0
- package/build/scripts/test-fusion.js +61 -0
- package/build/scripts/test-github.js +27 -0
- package/build/scripts/test-group-split.js +52 -0
- package/build/scripts/test-hf-download.js +29 -0
- package/build/scripts/test-holdout-manager.js +61 -0
- package/build/scripts/test-hybrid-search.js +41 -0
- package/build/scripts/test-image-analysis.js +50 -0
- package/build/scripts/test-ingestion-infra.js +39 -0
- package/build/scripts/test-install.js +40 -0
- package/build/scripts/test-institutional.js +26 -0
- package/build/scripts/test-integrity.js +41 -0
- package/build/scripts/test-jit.js +42 -0
- package/build/scripts/test-job-queue.js +62 -0
- package/build/scripts/test-kaggle-download.js +34 -0
- package/build/scripts/test-large-data.js +50 -0
- package/build/scripts/test-mcp-v5.js +73 -0
- package/build/scripts/test-media-analysis.js +61 -0
- package/build/scripts/test-monitoring.js +91 -0
- package/build/scripts/test-observability.js +106 -0
- package/build/scripts/test-packager.js +55 -0
- package/build/scripts/test-pipeline.js +50 -0
- package/build/scripts/test-planning.js +64 -0
- package/build/scripts/test-privacy.js +38 -0
- package/build/scripts/test-quality.js +43 -0
- package/build/scripts/test-robust-ingestion.js +41 -0
- package/build/scripts/test-schema.js +45 -0
- package/build/scripts/test-split-validation.js +40 -0
- package/build/scripts/test-splitter.js +93 -0
- package/build/scripts/test-uci.js +27 -0
- package/build/scripts/test-unified-quality.js +86 -0
- package/build/search/embedder.js +34 -0
- package/build/search/engine.js +129 -0
- package/build/search/jit-orchestrator.js +232 -0
- package/build/search/vector-store.js +105 -0
- package/build/splitting/splitter.js +57 -0
- package/build/splitting/types.js +1 -0
- package/build/tools/formatter.js +227 -0
- package/build/utils/downloader.js +52 -0
- package/mcp-config-template.json +15 -0
- package/package.json +84 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/cleaner.py +196 -0
- package/src/python/export_engine.py +112 -0
- package/src/python/framework_adapters.py +100 -0
- package/src/python/github_adapter.py +106 -0
- package/src/python/image_engine.py +86 -0
- package/src/python/media_engine.py +133 -0
- package/src/python/nasa_adapter.py +82 -0
- package/src/python/quality_engine.py +243 -0
- package/src/python/splitter_engine.py +283 -0
- package/src/python/test_framework_adapters.py +61 -0
- package/src/python/uci_adapter.py +94 -0
- package/src/python/worldbank_adapter.py +99 -0
package/build/scripts/test-compliance.js
@@ -0,0 +1,58 @@
+import { MetadataStore } from "../metadata/store.js";
+import { ComplianceStore } from "../compliance/store.js";
+import { ComplianceService } from "../compliance/service.js";
+import path from "path";
+import fs from "fs";
+async function runTest() {
+    const dbPath = path.resolve("data", "test-compliance.db");
+    const metadataStore = new MetadataStore(dbPath);
+    const complianceStore = new ComplianceStore(metadataStore.db);
+    const service = new ComplianceService(complianceStore);
+    const dsId = "org/health-data";
+    const medDataset = {
+        id: dsId,
+        source: "huggingface",
+        name: "Patient Records (Mock)",
+        domain: "healthcare",
+        has_personal_data: true,
+        quality_warnings: ["Potential PII detected in column 'patient_name'"],
+        last_updated: new Date().toISOString(),
+        license: { id: "proprietary", category: "restricted", usage_restrictions: [], warnings: [] },
+        // ... rest
+    };
+    console.log("--- Initial Compliance Check (Expecting Failure) ---");
+    const gdpr1 = await service.verifyGDPR(medDataset);
+    console.log(`GDPR Passed: ${gdpr1.passed}`);
+    console.log("Issues:", JSON.stringify(gdpr1.issues, null, 2));
+    const hipaa1 = await service.verifyHIPAA(medDataset);
+    console.log(`HIPAA Passed: ${hipaa1.passed}`);
+    console.log("Issues:", JSON.stringify(hipaa1.issues, null, 2));
+    console.log("\n--- Logging Operations (Audit Trail) ---");
+    service.logOperation("admin-123", dsId, "ComplianceCheck", { result: "Failed" });
+    service.logOperation("admin-123", dsId, "Clean", { rules: "PII-Masking" });
+    console.log("\n--- Resolving Compliance Issues ---");
+    complianceStore.saveConsent({
+        dataset_id: dsId,
+        consent_obtained: true,
+        source: "patient-portal",
+        last_verified: new Date().toISOString()
+    });
+    // Simulate de-identification
+    const cleanDataset = { ...medDataset, has_personal_data: false, quality_warnings: [] };
+    const gdpr2 = await service.verifyGDPR(cleanDataset);
+    console.log(`GDPR Passed (After Fix): ${gdpr2.passed}`);
+    const hipaa2 = await service.verifyHIPAA(cleanDataset);
+    console.log(`HIPAA Passed (After Fix): ${hipaa2.passed}`);
+    console.log("\n--- Exporting Audit Log ---");
+    const csv = service.exportAuditLog(dsId);
+    fs.writeFileSync("compliance-audit.csv", csv);
+    console.log("Audit log saved to compliance-audit.csv");
+    console.log("Log Snippet:\n", csv.split("\n").slice(0, 3).join("\n"));
+    if (gdpr2.passed && hipaa2.passed && csv.includes("Clean")) {
+        console.log("\n✅ Success: Compliance checks and audit trail verified.");
+    }
+    else {
+        console.error("\n❌ Failure: Compliance workflow incomplete.");
+    }
+}
+runTest().catch(console.error);
package/build/scripts/test-conversion.js
@@ -0,0 +1,64 @@
+import { MetadataStore } from "../metadata/store.js";
+import { PipelineExecutor } from "../cleaning/executor.js";
+import { InstallService } from "../install/install-service.js";
+import path from "path";
+import fs from "fs";
+async function runTest() {
+    console.log("--- Testing Format Conversion Export ---");
+    const projectRoot = path.resolve(".");
+    const metadataStore = new MetadataStore(path.join(projectRoot, "data", "metadata.db"));
+    const pipelineExecutor = new PipelineExecutor(projectRoot);
+    const installService = new InstallService(projectRoot, metadataStore);
+    // 1. Create a mock JSON dataset
+    const datasetId = "test/json-dataset";
+    const rawDir = path.join(projectRoot, "data", "raw");
+    if (!fs.existsSync(rawDir))
+        fs.mkdirSync(rawDir, { recursive: true });
+    const jsonPath = path.join(rawDir, "test_data.json");
+    const testData = [
+        { id: 1, name: "Alice", value: 10.5 },
+        { id: 2, name: "Bob", value: 20.1 }
+    ];
+    fs.writeFileSync(jsonPath, JSON.stringify(testData));
+    // Register in local_files
+    metadataStore.registerDownload(datasetId, jsonPath, "completed", fs.statSync(jsonPath).size);
+    // Register in datasets to satisfy inner check
+    metadataStore.saveDataset({
+        id: datasetId,
+        source: "huggingface",
+        name: "Test JSON",
+        description: "A test JSON file",
+        license: { category: "safe", id: "mit", usage_restrictions: [], warnings: [] },
+        last_updated: new Date().toISOString(),
+        quality_score: 80,
+        download_url: "http://example.com",
+        quality_warnings: []
+    });
+    console.log("Mock dataset registered.");
+    // 2. Simulate export_dataset with format="csv"
+    console.log(`Converting ${jsonPath} to CSV...`);
+    const downloadStatus = metadataStore.getDownloadStatus(datasetId);
+    if (!downloadStatus)
+        throw new Error("Dataset not found in DB");
+    let sourcePath = downloadStatus.local_path;
+    const requestedFormat = "csv";
+    const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
+    if (currentExt !== requestedFormat) {
+        console.log(`Mismatch detected. Running pipeline conversion...`);
+        const result = await pipelineExecutor.runPipeline(datasetId, sourcePath, requestedFormat);
+        sourcePath = result.final_output_path;
+    }
+    // 3. Install
+    const targetDir = path.join(projectRoot, "test-conversion-export");
+    const finalPath = await installService.install(datasetId, sourcePath, targetDir);
+    console.log(`✅ Final Export Path: ${finalPath}`);
+    if (finalPath.endsWith(".csv") && fs.existsSync(finalPath)) {
+        const content = fs.readFileSync(finalPath, "utf8");
+        console.log("CSV Content Preview:\n" + content);
+        console.log("\nVERIFICATION_STATUS: ✅ PASS");
+    }
+    else {
+        console.error("\nVERIFICATION_STATUS: ❌ FAIL - Result is not a CSV or file missing");
+    }
+}
+runTest().catch(console.error);
package/build/scripts/test-custom-rules.js
@@ -0,0 +1,58 @@
+import { CleaningPlanner } from "../cleaning/planner.js";
+import { NLP_PRESET, HEALTHCARE_PRESET } from "../cleaning/rules.js";
+import { RuleEvaluator } from "../cleaning/evaluator.js";
+async function runTest() {
+    const planner = new CleaningPlanner();
+    const evaluator = new RuleEvaluator();
+    const mockReport = {
+        row_count: 1000,
+        column_count: 3,
+        duplicate_rows: 0,
+        duplicate_percentage: 0,
+        columns: [
+            { name: "text", type: "Utf8", inferred_type: "String", missing_count: 0, missing_percentage: 0, unique_count: 1000, is_constant: false, is_mixed_type: false },
+            { name: "email", type: "Utf8", inferred_type: "String", missing_count: 0, missing_percentage: 0, unique_count: 1000, is_constant: false, is_mixed_type: false },
+            { name: "date", type: "Utf8", inferred_type: "Date", missing_count: 0, missing_percentage: 0, unique_count: 1000, is_constant: false, is_mixed_type: false }
+        ],
+        warnings: [],
+        schema_warnings: [],
+        overall_score: 90
+    };
+    console.log("--- Testing NLP Preset ---");
+    const nlpPlan = await planner.generatePlan("nlp-ds", mockReport, NLP_PRESET);
+    console.log(`Plan generated with ${nlpPlan.operations.length} operations.`);
+    nlpPlan.operations.forEach((op) => console.log(`- [${op.type}] ${op.reason}`));
+    console.log("\n--- Testing Healthcare Preset ---");
+    const hcPlan = await planner.generatePlan("hc-ds", mockReport, HEALTHCARE_PRESET);
+    console.log(`Plan generated with ${hcPlan.operations.length} operations.`);
+    hcPlan.operations.forEach((op) => console.log(`- [${op.type}] ${op.reason}`));
+    console.log("\n--- Testing RuleEvaluator Logic ---");
+    const record = {
+        text: "Check out https://google.com",
+        email: "user@example.com",
+        date: "2023-01-01"
+    };
+    console.log("Original Record:", JSON.stringify(record));
+    let processed = { ...record };
+    // Apply NLP Rules
+    for (const rule of NLP_PRESET.rules) {
+        if (evaluator.matches(processed, rule.condition)) {
+            processed = evaluator.apply(processed, rule);
+        }
+    }
+    // Apply Healthcare Rules
+    for (const rule of HEALTHCARE_PRESET.rules) {
+        if (evaluator.matches(processed, rule.condition)) {
+            processed = evaluator.apply(processed, rule);
+        }
+    }
+    console.log("Processed Record:", JSON.stringify(processed));
+    if (processed.text === "check out " && processed.email.includes("...")) {
+        console.log("\n✅ Success: Rules correctly applied and record transformed.");
+    }
+    else {
+        console.error("\n❌ Failure: Record transformation mismatch.");
+        console.log("Expected text to be lowercased and URL removed, and email to be masked.");
+    }
+}
+runTest().catch(console.error);
package/build/scripts/test-db-opt.js
@@ -0,0 +1,63 @@
+import { MetadataStore } from "../metadata/store.js";
+async function runTest() {
+    console.log("--- Initializing Database Optimization Test ---");
+    const store = new MetadataStore("data/vesper_test_opt.db");
+    // 1. Check if jobs_archive table exists
+    console.log("Verifying tables and indexes...");
+    // 2. Insert mock jobs (some old, some new)
+    const now = new Date();
+    const oldDate = new Date();
+    oldDate.setDate(now.getDate() - 40); // 40 days ago
+    const jobs = [
+        {
+            id: "job-new-1",
+            type: "clean",
+            status: "completed",
+            priority: 0,
+            progress: 100,
+            status_text: "Done",
+            attempts: 1,
+            max_attempts: 3,
+            created_at: now.toISOString(),
+            updated_at: now.toISOString()
+        },
+        {
+            id: "job-old-1",
+            type: "prepare",
+            status: "completed",
+            priority: 0,
+            progress: 100,
+            status_text: "Archivable",
+            attempts: 1,
+            max_attempts: 3,
+            created_at: oldDate.toISOString(),
+            updated_at: oldDate.toISOString()
+        }
+    ];
+    console.log("Inserting mock jobs...");
+    jobs.forEach(j => store.saveJob(j));
+    // 3. Test Archiving
+    console.log("\n--- Testing Archiving Logic (cutoff 30 days) ---");
+    const archivedCount = store.archiveOldJobs(30);
+    console.log(`Archived ${archivedCount} jobs (indicator).`);
+    // Verify
+    const newJob = store.getJob("job-new-1");
+    const oldJob = store.getJob("job-old-1");
+    if (newJob)
+        console.log("✅ New job remains in active table.");
+    if (!oldJob)
+        console.log("✅ Old job removed from active table.");
+    // Manually check archive via query if possible (not exposed, but we can check if it failed)
+    // 4. Test Optimization
+    console.log("\n--- Testing Maintenance Methods ---");
+    try {
+        store.optimize();
+        console.log("✅ Vacuum and Analyze completed.");
+    }
+    catch (e) {
+        console.error("❌ Optimization failed:", e);
+    }
+    console.log("\n✅ Success: Database optimizations verified.");
+    store.close();
+}
+runTest().catch(console.error);
package/build/scripts/test-export-custom.js
@@ -0,0 +1,33 @@
+import { InstallService } from "../install/install-service.js";
+import { MetadataStore } from "../metadata/store.js";
+import path from "path";
+import fs from "fs";
+async function runTest() {
+    console.log("--- Testing Custom Export Path ---");
+    const projectRoot = path.resolve(".");
+    const metadataStore = new MetadataStore(path.join(projectRoot, "data", "metadata.db"));
+    const installService = new InstallService(projectRoot, metadataStore);
+    // Find a naruto dataset
+    const dataset = metadataStore.db.prepare("SELECT * FROM datasets WHERE name LIKE '%naruto%' LIMIT 1").get();
+    if (!dataset) {
+        console.error("Naruto dataset not found. Please run a search first.");
+        return;
+    }
+    const customDir = path.join(projectRoot, "naruto-quotes");
+    const mockFile = path.join(projectRoot, "data", "raw", "naruto_test_export.csv");
+    if (!fs.existsSync(path.dirname(mockFile))) {
+        fs.mkdirSync(path.dirname(mockFile), { recursive: true });
+    }
+    fs.writeFileSync(mockFile, "quote,character\nBelieve it!,Naruto");
+    console.log(`Exporting ${dataset.id} to ${customDir}...`);
+    const finalPath = await installService.install(dataset.id, mockFile, customDir);
+    console.log(`✅ Success! Exported to: ${finalPath}`);
+    if (fs.existsSync(finalPath) && finalPath.includes("naruto-quotes")) {
+        console.log("Path verification: PASSED");
+    }
+    else {
+        console.error("Path verification: FAILED");
+    }
+    console.log("\n--- Test Complete ---");
+}
+runTest().catch(console.error);
package/build/scripts/test-exporter.js
@@ -0,0 +1,53 @@
+import { DataExporter } from "../export/exporter.js";
+import fs from "fs";
+import path from "path";
+async function main() {
+    console.log("=== Vesper Data Exporter Test ===\n");
+    const exporter = new DataExporter();
+    const testDir = path.join(process.cwd(), "test_export");
+    if (!fs.existsSync(testDir))
+        fs.mkdirSync(testDir);
+    const inputFile = path.join(testDir, "input.csv");
+    // Create Dummy Data
+    let csvContent = "id,name,value\n";
+    for (let i = 0; i < 10; i++)
+        csvContent += `${i},item_${i},${Math.random()}\n`;
+    fs.writeFileSync(inputFile, csvContent);
+    console.log(`Created input file: ${inputFile}`);
+    const formats = ["parquet", "jsonl", "arrow"];
+    for (const fmt of formats) {
+        const outputFile = path.join(testDir, `output.${fmt}`);
+        console.log(`\n--- Exporting to ${fmt.toUpperCase()} ---`);
+        try {
+            const result = await exporter.export(inputFile, outputFile, fmt);
+            console.log("Result:", result);
+            if (fs.existsSync(outputFile)) {
+                const stats = fs.statSync(outputFile);
+                console.log(`PASS: File created (${stats.size} bytes)`);
+            }
+            else {
+                console.error("FAIL: Output file not found");
+            }
+        }
+        catch (e) {
+            console.error(`FAIL: Export to ${fmt} failed:`, e);
+        }
+    }
+    // specific Arrow test
+    // specific TFRecord test (might fail if no tensorflow)
+    console.log(`\n--- Exporting to TFRECORD (Optional) ---`);
+    const tfFile = path.join(testDir, "output.tfrecord");
+    try {
+        const result = await exporter.export(inputFile, tfFile, "tfrecord");
+        console.log("Result:", result);
+        if (fs.existsSync(tfFile)) {
+            console.log("PASS: TFRecord file created");
+        }
+    }
+    catch (e) {
+        console.log("SKIP: TFRecord export failed (likely no tensorflow installed):", e.message || e);
+    }
+    // Cleanup
+    // fs.rmSync(testDir, { recursive: true, force: true });
+}
+main().catch(console.error);
package/build/scripts/test-fusion.js
@@ -0,0 +1,61 @@
+import { FusionOrchestrator } from "../fusion/orchestrator.js";
+import * as fs from "fs";
+async function runTest() {
+    const ds1 = {
+        id: "ds1",
+        name: "Dataset 1",
+        source: "huggingface",
+        columns: [
+            { name: "text", type: "string" },
+            { name: "label", type: "int", is_target: true }
+        ],
+        // ... other required fields (using type assertion for brevity in test)
+    };
+    const ds2 = {
+        id: "ds2",
+        name: "Dataset 2",
+        source: "kaggle",
+        columns: [
+            { name: "sentence", type: "string" },
+            { name: "sentiment", type: "string", is_target: true }
+        ],
+    };
+    const config = {
+        target_column: "target",
+        column_aliases: {
+            "text": ["sentence", "content"],
+            "target": ["label", "sentiment", "target"]
+        },
+        type_overrides: {},
+        dedupe_config: {
+            exact: true,
+            fuzzy: true,
+            fuzzy_threshold: 0.6,
+            fuzzy_columns: ["text"]
+        },
+        label_map: {
+            "pos": 1,
+            "neg": 0,
+            "0": 0,
+            "1": 1
+        }
+    };
+    const records = [
+        { datasetId: "ds1", record: { text: "I love this!", label: 1 } },
+        { datasetId: "ds1", record: { text: "This is bad.", label: 0 } },
+        { datasetId: "ds2", record: { sentence: "I love this!", sentiment: "pos" } }, // Exact duplicate (after alignment)
+        { datasetId: "ds2", record: { sentence: "I really love this!", sentiment: "pos" } }, // Fuzzy duplicate
+        { datasetId: "ds2", record: { sentence: "It was okay.", sentiment: "neg" } },
+    ];
+    const orchestrator = new FusionOrchestrator(config);
+    const result = await orchestrator.fuse([ds1, ds2], records);
+    fs.writeFileSync("test-fusion-results.json", JSON.stringify(result, null, 2));
+    console.log("Results written to test-fusion-results.json");
+    if (result.stats.total_output_rows === 3) {
+        console.log("✅ Success: Rows correctly fused and deduplicated.");
+    }
+    else {
+        console.error(`❌ Failure: Expected 3 rows, got ${result.stats.total_output_rows}`);
+    }
+}
+runTest().catch(console.error);
package/build/scripts/test-github.js
@@ -0,0 +1,27 @@
+import { GitHubScraper } from "../metadata/github-scraper.js";
+async function runTest() {
+    console.log("--- Testing GitHub integration ---");
+    const scraper = new GitHubScraper();
+    const query = "covid-19";
+    console.log(`Searching GitHub for: "${query}"...`);
+    const results = await scraper.scrape(query, 5);
+    console.log(`Found ${results.length} datasets.`);
+    if (results.length > 0) {
+        console.log("✅ GitHub Scraper returned results.");
+        console.log("Sample Result:");
+        console.log(JSON.stringify(results[0], null, 2));
+        // Validation
+        const sample = results[0];
+        if (sample.id.startsWith("github:") && sample.source === "github") {
+            console.log("✅ Metadata schema validation passed.");
+        }
+        else {
+            console.error("❌ Metadata schema validation failed.");
+        }
+    }
+    else {
+        console.error("❌ No results found or rate limit hit.");
+    }
+    console.log("--- Test Complete ---");
+}
+runTest().catch(console.error);
package/build/scripts/test-group-split.js
@@ -0,0 +1,52 @@
+import { DataSplitter } from "../splitting/splitter.js";
+import fs from "fs";
+import path from "path";
+async function main() {
+    console.log("=== Vesper Group Split Test ===\n");
+    const splitter = new DataSplitter();
+    const testFile = path.join(process.cwd(), "test_group_split.csv");
+    // Create Dummy Data (100 rows, 10 groups of 10 rows each)
+    let csvContent = "id,group_id,value\n";
+    for (let g = 0; g < 10; g++) {
+        for (let i = 0; i < 10; i++) {
+            csvContent += `${g * 10 + i},group_${g},${Math.random()}\n`;
+        }
+    }
+    fs.writeFileSync(testFile, csvContent);
+    console.log(`Created test file with 10 distinct groups.`);
+    // Test: Group-based Split (60/20/20)
+    const config = {
+        type: "group",
+        ratios: { train: 0.6, val: 0.2, test: 0.2, holdout: 0 },
+        group_column: "group_id",
+        shuffle: true,
+        random_seed: 42
+    };
+    try {
+        const result = await splitter.split(testFile, config);
+        console.log("Stats:", result.stats);
+        // Validation: No group should exist in more than one split
+        console.log("\n--- Group Leakage Validation ---");
+        const report = await splitter.validate(result.paths, { id_column: "group_id" }); // Checking uniqueness of group_id across splits
+        console.log("Validation Report:", report);
+        if (!report.leakage_detected) {
+            console.log("PASS: No group leakage detected.");
+        }
+        else {
+            console.error("FAIL: Groups leaked across splits!");
+        }
+        // Cleanup
+        Object.values(result.paths).forEach(p => {
+            if (fs.existsSync(p))
+                fs.unlinkSync(p);
+        });
+    }
+    catch (e) {
+        console.error("Test execution failed:", e);
+    }
+    finally {
+        if (fs.existsSync(testFile))
+            fs.unlinkSync(testFile);
+    }
+}
+main().catch(console.error);
package/build/scripts/test-hf-download.js
@@ -0,0 +1,29 @@
+import { HFDownloader } from "../ingestion/hf-downloader.js";
+import path from "path";
+import fs from "fs";
+import { fileURLToPath } from "url";
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+const projectRoot = path.join(__dirname, "..", "..");
+async function testHFDownload() {
+    const downloader = new HFDownloader();
+    const repoId = "fka/awesome-chatgpt-prompts";
+    console.log(`Testing HF Download for ${repoId}...`);
+    const bestFile = await downloader.findBestFile(repoId);
+    console.log(`Best file found: ${bestFile}`);
+    if (bestFile) {
+        const testDir = path.join(projectRoot, "data", "test");
+        if (!fs.existsSync(testDir))
+            fs.mkdirSync(testDir, { recursive: true });
+        const targetPath = path.join(testDir, "awesome-prompts.csv");
+        await downloader.download(repoId, bestFile, targetPath, (p) => {
+            process.stdout.write(`\rProgress: ${p}%`);
+        });
+        console.log(`\nDownload complete! Size: ${fs.statSync(targetPath).size} bytes`);
+        console.log(`Location: ${targetPath}`);
+    }
+    else {
+        console.error("No file found!");
+    }
+}
+testHFDownload().catch(console.error);
package/build/scripts/test-holdout-manager.js
@@ -0,0 +1,61 @@
+import { DataSplitter } from "../splitting/splitter.js";
+import fs from "fs";
+import path from "path";
+async function main() {
+    console.log("=== Vesper Holdout Set Manager Test ===\n");
+    const splitter = new DataSplitter();
+    const testFile = path.join(process.cwd(), "test_holdout.csv");
+    // Create Dummy Data (200 rows)
+    let csvContent = "id,label,date\n";
+    for (let i = 0; i < 100; i++)
+        csvContent += `${i},A,2023-01-${(i % 30) + 1}\n`;
+    for (let i = 100; i < 200; i++)
+        csvContent += `${i},B,2023-02-${(i % 28) + 1}\n`;
+    fs.writeFileSync(testFile, csvContent);
+    console.log(`Created test file: ${testFile}`);
+    // Test: 4-way Random Split (70/10/10/10)
+    console.log("\n--- Test: 4-way Random Split (70/10/10/10) ---");
+    const config = {
+        type: "random",
+        ratios: { train: 0.7, val: 0.1, test: 0.1, holdout: 0.1 },
+        shuffle: true,
+        random_seed: 42
+    };
+    try {
+        const result = await splitter.split(testFile, config);
+        console.log("Stats:", result.stats);
+        const expected = { train: 140, val: 20, test: 20, holdout: 20 };
+        if (result.stats.train_rows === expected.train &&
+            result.stats.val_rows === expected.val &&
+            result.stats.test_rows === expected.test &&
+            result.stats.holdout_rows === expected.holdout) {
+            console.log("PASS: 4-way ratios preserved");
+        }
+        else {
+            console.error(`FAIL: Ratios mismatch. Expected ${JSON.stringify(expected)}, got ${JSON.stringify(result.stats)}`);
+        }
+        // Validate Split for Leakage
+        console.log("\n--- Validation Check ---");
+        const report = await splitter.validate(result.paths, { id_column: "id" });
+        console.log("Report:", report);
+        if (!report.leakage_detected) {
+            console.log("PASS: No leakage between 4 splits");
+        }
+        else {
+            console.error("FAIL: Leakage detected!");
+        }
+        // Cleanup
+        Object.values(result.paths).forEach(p => {
+            if (fs.existsSync(p))
+                fs.unlinkSync(p);
+        });
+    }
+    catch (e) {
+        console.error("Test execution failed:", e);
+    }
+    finally {
+        if (fs.existsSync(testFile))
+            fs.unlinkSync(testFile);
+    }
+}
+main().catch(console.error);
package/build/scripts/test-hybrid-search.js
@@ -0,0 +1,41 @@
+import { SearchEngine } from "../search/engine.js";
+import { MetadataStore } from "../metadata/store.js";
+import { VectorStore } from "../search/vector-store.js";
+import { Embedder } from "../search/embedder.js";
+import path from "path";
+async function main() {
+    const dbPath = path.join(process.cwd(), "data", "metadata.db");
+    const vectorPath = path.join(process.cwd(), "data", "vectors.json");
+    const store = new MetadataStore(dbPath);
+    const vectorStore = new VectorStore(vectorPath);
+    const embedder = Embedder.getInstance();
+    await embedder.init();
+    const engine = new SearchEngine(store, vectorStore, embedder);
+    console.log("\n=== Hybrid Search Precision Test ===\n");
+    // Test 1: Financial Forecasting (Drift Check)
+    // Should NOT have crypto in top results if hybrid logic works
+    console.log("Test 1: 'financial forecasting' (Should penalize Crypto)");
+    const results1 = await engine.search("financial forecasting", { limit: 5 });
+    results1.forEach((r, i) => {
+        const score = r.relevance_score;
+        const vector = r.vector_score;
+        const lexical = r.lexical_score;
+        console.log(` ${i + 1}. [${score}] ${r.name} (Vec: ${vector}, Lex: ${lexical})`);
+        if (r.name.toLowerCase().includes("crypto") || r.description.toLowerCase().includes("bitcoin")) {
+            console.error(" CRITICAL: Crypto found in top results!");
+        }
+    });
+    console.log("");
+    // Test 2: Negative Keywords
+    console.log("Test 2: 'financial forecasting -stock' (Should exclude 'stock')");
+    const results2 = await engine.search("financial forecasting -stock", { limit: 5 });
+    results2.forEach((r, i) => {
+        console.log(` ${i + 1}. ${r.name}`);
+        if (r.name.toLowerCase().includes("stock") || r.description.toLowerCase().includes("stock")) {
+            console.error(" ERROR: Negative keyword failed!");
+        }
+    });
+    console.log("\nDone.");
+    store.close();
+}
+main().catch(console.error);
package/build/scripts/test-image-analysis.js
@@ -0,0 +1,50 @@
+import { ImageAnalyzer } from "../quality/image-analyzer.js";
+import path from "path";
+import fs from "fs";
+import { execSync } from "child_process";
+async function runTest() {
+    console.log("--- Testing Image Quality Analysis ---");
+    const projectRoot = path.resolve(".");
+    const analyzer = new ImageAnalyzer(projectRoot);
+    // 1. Create a sample image using Python (to avoid external dependencies)
+    const testImageDir = path.join(projectRoot, "data", "test-images");
+    if (!fs.existsSync(testImageDir))
+        fs.mkdirSync(testImageDir, { recursive: true });
+    const imagePath = path.join(testImageDir, "test_v1.png");
+    console.log("Generating test image...");
+    const pythonScript = `
+from PIL import Image, ImageDraw
+import numpy as np
+img = Image.new('RGB', (800, 600), color = (73, 109, 137))
+d = ImageDraw.Draw(img)
+d.text((10,10), "Vesper Test Image", fill=(255,255,0))
+# Add some noise for blur check
+noise = np.random.randint(0, 255, (600, 800, 3), dtype='uint8')
+img_np = np.array(img)
+img_np = (img_np * 0.5 + noise * 0.5).astype('uint8')
+Image.fromarray(img_np).save('${imagePath.replace(/\\/g, "\\\\")}')
+`;
+    fs.writeFileSync(path.join(testImageDir, "gen_image.py"), pythonScript);
+    execSync(`python "${path.join(testImageDir, "gen_image.py")}"`);
+    // 2. Run Analysis
+    console.log(`Analyzing ${imagePath}...`);
+    try {
+        const report = await analyzer.analyze(imagePath);
+        console.log("Analysis Result:");
+        console.log(`- Format: ${report.individual_results[0].format}`);
+        console.log(`- Resolution: ${report.individual_results[0].width}x${report.individual_results[0].height}`);
+        console.log(`- Blur Score: ${report.individual_results[0].blur_score}`);
+        console.log(`- Is Blurry: ${report.individual_results[0].is_blurry}`);
+        if (report.total_images === 1 && report.average_width === 800) {
+            console.log("\nVERIFICATION_STATUS: PASS");
+        }
+        else {
+            console.log("\nVERIFICATION_STATUS: FAIL - Incorrect stats");
+        }
+    }
+    catch (e) {
+        console.error(`Analysis failed: ${e.message}`);
+        console.log("\nVERIFICATION_STATUS: FAIL");
+    }
+}
+runTest().catch(console.error);