@vespermcp/mcp-server 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +259 -0
- package/build/cache/cdn.js +34 -0
- package/build/cache/service.js +63 -0
- package/build/cleaning/cleaner.js +50 -0
- package/build/cleaning/evaluator.js +89 -0
- package/build/cleaning/executor.js +60 -0
- package/build/cleaning/exporter.js +87 -0
- package/build/cleaning/planner.js +111 -0
- package/build/cleaning/rules.js +57 -0
- package/build/cleaning/types.js +1 -0
- package/build/cloud/adapters/local.js +37 -0
- package/build/cloud/adapters/s3.js +24 -0
- package/build/cloud/storage-manager.js +20 -0
- package/build/cloud/types.js +1 -0
- package/build/compliance/service.js +73 -0
- package/build/compliance/store.js +80 -0
- package/build/compliance/types.js +1 -0
- package/build/data/processing-worker.js +23 -0
- package/build/data/streaming.js +38 -0
- package/build/data/worker-pool.js +39 -0
- package/build/export/exporter.js +45 -0
- package/build/export/packager.js +100 -0
- package/build/export/types.js +1 -0
- package/build/fusion/aligner.js +56 -0
- package/build/fusion/deduplicator.js +69 -0
- package/build/fusion/harmonizer.js +39 -0
- package/build/fusion/orchestrator.js +86 -0
- package/build/fusion/types.js +1 -0
- package/build/index.js +632 -0
- package/build/ingestion/hf-downloader.js +64 -0
- package/build/ingestion/ingestor.js +96 -0
- package/build/ingestion/kaggle-downloader.js +79 -0
- package/build/install/install-service.js +41 -0
- package/build/jobs/manager.js +129 -0
- package/build/jobs/queue.js +59 -0
- package/build/jobs/types.js +1 -0
- package/build/metadata/domain.js +147 -0
- package/build/metadata/github-scraper.js +47 -0
- package/build/metadata/institutional-scrapers.js +49 -0
- package/build/metadata/kaggle-scraper.js +182 -0
- package/build/metadata/license.js +68 -0
- package/build/metadata/monitoring-service.js +107 -0
- package/build/metadata/monitoring-store.js +78 -0
- package/build/metadata/monitoring-types.js +1 -0
- package/build/metadata/quality.js +48 -0
- package/build/metadata/rate-limiter.js +128 -0
- package/build/metadata/scraper.js +353 -0
- package/build/metadata/store.js +325 -0
- package/build/metadata/types.js +1 -0
- package/build/metadata/uci-scraper.js +49 -0
- package/build/monitoring/observability.js +76 -0
- package/build/quality/analyzer.js +57 -0
- package/build/quality/image-analyzer.js +46 -0
- package/build/quality/media-analyzer.js +46 -0
- package/build/quality/quality-orchestrator.js +162 -0
- package/build/quality/types.js +1 -0
- package/build/scripts/build-index.js +54 -0
- package/build/scripts/check-db.js +73 -0
- package/build/scripts/check-jobs.js +24 -0
- package/build/scripts/check-naruto.js +17 -0
- package/build/scripts/demo-full-pipeline.js +62 -0
- package/build/scripts/demo-ui.js +58 -0
- package/build/scripts/e2e-demo.js +72 -0
- package/build/scripts/massive-scrape.js +103 -0
- package/build/scripts/ops-dashboard.js +33 -0
- package/build/scripts/scrape-metadata.js +100 -0
- package/build/scripts/search-cli.js +26 -0
- package/build/scripts/test-bias.js +45 -0
- package/build/scripts/test-caching.js +51 -0
- package/build/scripts/test-cleaning.js +76 -0
- package/build/scripts/test-cloud-storage.js +48 -0
- package/build/scripts/test-compliance.js +58 -0
- package/build/scripts/test-conversion.js +64 -0
- package/build/scripts/test-custom-rules.js +58 -0
- package/build/scripts/test-db-opt.js +63 -0
- package/build/scripts/test-export-custom.js +33 -0
- package/build/scripts/test-exporter.js +53 -0
- package/build/scripts/test-fusion.js +61 -0
- package/build/scripts/test-github.js +27 -0
- package/build/scripts/test-group-split.js +52 -0
- package/build/scripts/test-hf-download.js +29 -0
- package/build/scripts/test-holdout-manager.js +61 -0
- package/build/scripts/test-hybrid-search.js +41 -0
- package/build/scripts/test-image-analysis.js +50 -0
- package/build/scripts/test-ingestion-infra.js +39 -0
- package/build/scripts/test-install.js +40 -0
- package/build/scripts/test-institutional.js +26 -0
- package/build/scripts/test-integrity.js +41 -0
- package/build/scripts/test-jit.js +42 -0
- package/build/scripts/test-job-queue.js +62 -0
- package/build/scripts/test-kaggle-download.js +34 -0
- package/build/scripts/test-large-data.js +50 -0
- package/build/scripts/test-mcp-v5.js +73 -0
- package/build/scripts/test-media-analysis.js +61 -0
- package/build/scripts/test-monitoring.js +91 -0
- package/build/scripts/test-observability.js +106 -0
- package/build/scripts/test-packager.js +55 -0
- package/build/scripts/test-pipeline.js +50 -0
- package/build/scripts/test-planning.js +64 -0
- package/build/scripts/test-privacy.js +38 -0
- package/build/scripts/test-quality.js +43 -0
- package/build/scripts/test-robust-ingestion.js +41 -0
- package/build/scripts/test-schema.js +45 -0
- package/build/scripts/test-split-validation.js +40 -0
- package/build/scripts/test-splitter.js +93 -0
- package/build/scripts/test-uci.js +27 -0
- package/build/scripts/test-unified-quality.js +86 -0
- package/build/search/embedder.js +34 -0
- package/build/search/engine.js +129 -0
- package/build/search/jit-orchestrator.js +232 -0
- package/build/search/vector-store.js +105 -0
- package/build/splitting/splitter.js +57 -0
- package/build/splitting/types.js +1 -0
- package/build/tools/formatter.js +227 -0
- package/build/utils/downloader.js +52 -0
- package/mcp-config-template.json +15 -0
- package/package.json +84 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/cleaner.py +196 -0
- package/src/python/export_engine.py +112 -0
- package/src/python/framework_adapters.py +100 -0
- package/src/python/github_adapter.py +106 -0
- package/src/python/image_engine.py +86 -0
- package/src/python/media_engine.py +133 -0
- package/src/python/nasa_adapter.py +82 -0
- package/src/python/quality_engine.py +243 -0
- package/src/python/splitter_engine.py +283 -0
- package/src/python/test_framework_adapters.py +61 -0
- package/src/python/uci_adapter.py +94 -0
- package/src/python/worldbank_adapter.py +99 -0
package/build/scripts/test-ingestion-infra.js
@@ -0,0 +1,39 @@
+import { MetadataStore } from "../metadata/store.js";
+import { DataIngestor } from "../ingestion/ingestor.js";
+import path from "path";
+import fs from "fs";
+import { fileURLToPath } from "url";
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+const projectRoot = path.join(__dirname, "..", "..");
+const dbPath = path.join(projectRoot, "data", "test-metadata.db");
+// Clean test DB if exists
+if (fs.existsSync(dbPath))
+    fs.unlinkSync(dbPath);
+const store = new MetadataStore(dbPath);
+const ingestor = new DataIngestor(projectRoot, store);
+async function testInfra() {
+    console.log("--- Testing Ingestion Infrastructure (6.1) ---");
+    const testId = "test/dataset";
+    // 1. Get target path
+    const target = ingestor.getTargetPath(testId);
+    console.log(`Target path: ${target}`);
+    // 2. Register downloading
+    console.log("Registering download start...");
+    store.registerDownload(testId, target, 'downloading');
+    let status = store.getDownloadStatus(testId);
+    console.log("Status after start:", status?.status);
+    // 3. Register completion
+    console.log("Registering completion...");
+    ingestor.completeDownload(testId, target, 1024);
+    status = store.getDownloadStatus(testId);
+    console.log("Status after completion:", status?.status);
+    console.log("Saved path:", status?.local_path);
+    console.log("Size:", status?.size_bytes, "bytes");
+    // 4. Verify directory exists
+    const rawDir = path.join(projectRoot, "data", "raw");
+    console.log(`Raw data dir created: ${fs.existsSync(rawDir)}`);
+    console.log("\nInfrastructure test PASSED!");
+    store.close();
+}
+testInfra().catch(console.error);
package/build/scripts/test-install.js
@@ -0,0 +1,40 @@
+import { InstallService } from "../install/install-service.js";
+import { MetadataStore } from "../metadata/store.js";
+import path from "path";
+import fs from "fs";
+async function runTest() {
+    console.log("--- Testing Automatic Installation ---");
+    const projectRoot = path.resolve(".");
+    const metadataStore = new MetadataStore(path.join(projectRoot, "data", "metadata.db"));
+    const installService = new InstallService(projectRoot, metadataStore);
+    const query = "naruto";
+    const dataset = metadataStore.db.prepare("SELECT * FROM datasets WHERE name LIKE ? LIMIT 1").get(`%${query}%`);
+    if (!dataset) {
+        console.error("Naruto dataset not found in metadata.db. Please run a search first.");
+        return;
+    }
+    console.log(`Found dataset: ${dataset.name}`);
+    // Mock a prepared file
+    const mockFile = path.join(projectRoot, "data", "raw", "naruto_mock.csv");
+    if (!fs.existsSync(path.dirname(mockFile))) {
+        fs.mkdirSync(path.dirname(mockFile), { recursive: true });
+    }
+    fs.writeFileSync(mockFile, "quote,character\nBelieve it!,Naruto\nI will be Hokage,Naruto");
+    console.log(`Installing ${mockFile}...`);
+    const installPath = await installService.install(dataset.id, mockFile);
+    console.log(`✅ Success! Installed to: ${installPath}`);
+    // Verify it exists
+    if (fs.existsSync(installPath)) {
+        console.log("File exists at install location.");
+    }
+    else {
+        console.error("File MISSING from install location!");
+    }
+    // Verify metadata updated
+    const updated = metadataStore.getDataset(dataset.id);
+    const success = updated?.install_path === installPath;
+    console.log(`Updated install_path in metadata: ${updated?.install_path}`);
+    console.log(`VERIFICATION_STATUS: ${success ? "✅ PASS" : "❌ FAIL"}`);
+    console.log("\n--- Test Complete ---");
+}
+runTest().catch(console.error);
package/build/scripts/test-institutional.js
@@ -0,0 +1,26 @@
+import { WorldBankScraper, NASAScraper } from "../metadata/institutional-scrapers.js";
+async function runTest() {
+    console.log("--- Testing Institutional Data Sources ---");
+    const wb = new WorldBankScraper();
+    const nasa = new NASAScraper();
+    console.log('Searching World Bank for: "climate"...');
+    const wbResults = await wb.scrape("climate", 3);
+    console.log(`Found ${wbResults.length} World Bank results.`);
+    if (wbResults.length > 0) {
+        console.log("✅ World Bank Sample:");
+        console.log(` - ID: ${wbResults[0].id}`);
+        console.log(` - Name: ${wbResults[0].name}`);
+        console.log(` - URL: ${wbResults[0].metadata_url}`);
+    }
+    console.log('\nSearching NASA for: "astronomy"...');
+    const nasaResults = await nasa.scrape("astronomy", 3);
+    console.log(`Found ${nasaResults.length} NASA results.`);
+    if (nasaResults.length > 0) {
+        console.log("✅ NASA Sample:");
+        console.log(` - ID: ${nasaResults[0].id}`);
+        console.log(` - Name: ${nasaResults[0].name}`);
+        console.log(` - URL: ${nasaResults[0].metadata_url}`);
+    }
+    console.log("\n--- Test Complete ---");
+}
+runTest().catch(console.error);
package/build/scripts/test-integrity.js
@@ -0,0 +1,41 @@
+import { QualityAnalyzer } from "../quality/analyzer.js";
+import fs from "fs";
+import path from "path";
+async function main() {
+    console.log("=== Vesper Integrity Check Test ===\n");
+    const analyzer = new QualityAnalyzer();
+    const testFile = path.join(process.cwd(), "test_integrity.csv");
+    // Create a CSV with unique IDs but duplicate CONTENT
+    // This simulates a common data issue: exact same review scraped twice with different IDs
+    const csvContent = `id,review,label
+1,"This product involves huge risks.",negative
+2,"Great investment opportunity!",positive
+3,"This product involves huge risks.",negative
+4,"Wait and see.",neutral
+5,"Great investment opportunity!",positive`;
+    fs.writeFileSync(testFile, csvContent);
+    console.log(`Created test file: ${testFile}`);
+    console.log("Expectations:");
+    console.log(" - Duplicate Rows: 0 (because IDs differ)");
+    console.log(" - Text Duplicates: > 0 (because 'review' column has dupes)");
+    try {
+        console.log("\nRunning analyzer...");
+        const report = await analyzer.analyze(testFile);
+        console.log("\n=== Integrity Report ===");
+        console.log(`Duplicate Rows (Exact): ${report.duplicate_rows}`);
+        console.log(`Text Duplicates (Content): ${report.text_duplicates || 0}`);
+        console.log("\nWarnings:", report.warnings);
+        if (report.text_duplicates && report.text_duplicates > 0) {
+            console.log("Test passed! Detected text duplication despite unique IDs.");
+        }
+        else {
+            console.error("Test failed! Did not detect text duplicates.");
+        }
+        // Cleanup
+        fs.unlinkSync(testFile);
+    }
+    catch (error) {
+        console.error("\nTest failed:", error);
+    }
+}
+main().catch(console.error);
package/build/scripts/test-jit.js
@@ -0,0 +1,42 @@
+import { SearchEngine } from "../search/engine.js";
+import { MetadataStore } from "../metadata/store.js";
+import { VectorStore } from "../search/vector-store.js";
+import { Embedder } from "../search/embedder.js";
+import path from "path";
+/**
+ * Test JIT fallback with various queries
+ */
+async function main() {
+    const dbPath = path.join(process.cwd(), "data", "metadata.db");
+    const vectorPath = path.join(process.cwd(), "data", "vectors.json");
+    const store = new MetadataStore(dbPath);
+    const vectorStore = new VectorStore(vectorPath);
+    const embedder = Embedder.getInstance();
+    await embedder.init();
+    const engine = new SearchEngine(store, vectorStore, embedder);
+    console.log("\n=== JIT Fallback Test Suite ===\n");
+    // Test 1: Query that should have good results (should NOT trigger JIT)
+    console.log("Test 1: Common query (should NOT trigger JIT)");
+    console.log("Query: 'image classification cats dogs'\n");
+    const results1 = await engine.search("image classification cats dogs", { limit: 5 });
+    console.log(`Results: ${results1.length}, Top score: ${results1[0]?.relevance_score || 0}`);
+    console.log("---\n");
+    // Test 2: Very niche query (should trigger JIT)
+    console.log("Test 2: Niche query (SHOULD trigger JIT)");
+    console.log("Query: 'underwater acoustic signal processing dolphins'\n");
+    const results2 = await engine.search("underwater acoustic signal processing dolphins", { limit: 5 });
+    console.log(`Results: ${results2.length}, Top score: ${results2[0]?.relevance_score || 0}`);
+    console.log("---\n");
+    // Test 3: Another niche query
+    console.log("Test 3: Another niche query (SHOULD trigger JIT)");
+    console.log("Query: 'mongolian language morphological analysis'\n");
+    const results3 = await engine.search("mongolian language morphological analysis", { limit: 5 });
+    console.log(`Results: ${results3.length}, Top score: ${results3[0]?.relevance_score || 0}`);
+    console.log("---\n");
+    // Check final dataset count
+    const finalCount = store.getAllDatasets().length;
+    console.log(`\nFinal dataset count: ${finalCount}`);
+    console.log("(Compare with initial 1238 to see if JIT added new datasets)\n");
+    store.close();
+}
+main().catch(console.error);
package/build/scripts/test-job-queue.js
@@ -0,0 +1,62 @@
+import { MetadataStore } from "../metadata/store.js";
+import { JobManager } from "../jobs/manager.js";
+async function runTest() {
+    console.log("--- Initializing Job Queue Test ---");
+    const store = new MetadataStore("data/vesper_test_jobs.db");
+    const manager = JobManager.getInstance(store);
+    manager.setConcurrency(2); // 2 parallel workers
+    const jobsFinished = [];
+    // Listener to simulate job processing
+    manager.on("processJob", async (job, run) => {
+        console.log(`[Worker] Starting Job: ${job.id} (Type: ${job.type}, Priority: ${job.priority})`);
+        await run(async () => {
+            // Simulate variable workload
+            const duration = job.priority === 10 ? 500 : 2000;
+            await new Promise(r => setTimeout(r, duration));
+            // Simulate failure for a specific job to test retries
+            if (job.metadata === "FAIL_ONCE" && job.attempts === 0) {
+                console.log(`[Worker] Simulating failure for job ${job.id}`);
+                throw new Error("Transitory error");
+            }
+            console.log(`[Worker] Finished Job: ${job.id}`);
+            jobsFinished.push(job.id);
+        });
+    });
+    console.log("\n--- Enqueuing Jobs ---");
+    // 1. A slow low-priority job
+    const j1 = manager.createJob("prepare", 0, "slow-1");
+    // 2. A fast high-priority job (Pro user)
+    const j2 = manager.createJob("clean", 10, "pro-1");
+    // 3. Another low-priority job
+    const j3 = manager.createJob("split", 0, "slow-2");
+    // 4. A job that fails once
+    const j4 = manager.createJob("fusion", 5, "FAIL_ONCE");
+    console.log(`Enqueued 4 jobs. Concurrency is 2.`);
+    // Wait for all to finish
+    return new Promise((resolve) => {
+        const check = setInterval(() => {
+            const stats = manager.queue.getStats();
+            if (stats.total === 0 && jobsFinished.includes(j4.id)) {
+                clearInterval(check);
+                console.log("\n--- Test Results ---");
+                console.log("Execution Order:", jobsFinished);
+                const proIndex = jobsFinished.indexOf(j2.id);
+                const slowIndex = jobsFinished.indexOf(j3.id);
+                if (proIndex < slowIndex) {
+                    console.log("✅ Priority verified: Pro job finished before later low-priority jobs.");
+                }
+                else {
+                    console.warn("⚠️ Priority check failed or inconclusive due to parallel timing.");
+                }
+                const j4_final = store.getJob(j4.id);
+                if (j4_final?.attempts === 1) {
+                    console.log("✅ Retry logic verified: Job retried after failure.");
+                }
+                console.log("\n✅ Success: Job queue system verified.");
+                store.close();
+                resolve(null);
+            }
+        }, 1000);
+    });
+}
+runTest().catch(console.error);
package/build/scripts/test-kaggle-download.js
@@ -0,0 +1,34 @@
+import { KaggleDownloader } from "../ingestion/kaggle-downloader.js";
+import path from "path";
+import fs from "fs";
+import { fileURLToPath } from "url";
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+const projectRoot = path.join(__dirname, "..", "..");
+async function testKaggleDownload() {
+    // Try to get credentials from env
+    const user = process.env.KAGGLE_USERNAME;
+    const key = process.env.KAGGLE_KEY;
+    if (!user || !key) {
+        console.error("KAGGLE_USERNAME and KAGGLE_KEY must be set to run this test.");
+        return;
+    }
+    const downloader = new KaggleDownloader(user, key);
+    const repoId = "shivam2503/diamonds"; // Small classic dataset
+    console.log(`Testing Kaggle Download for ${repoId}...`);
+    const targetDir = path.join(projectRoot, "data", "test", "kaggle_diamonds");
+    if (fs.existsSync(targetDir))
+        fs.rmSync(targetDir, { recursive: true });
+    try {
+        const bestFile = await downloader.download(repoId, targetDir, (p) => {
+            process.stdout.write(`\rProgress: ${p}%`);
+        });
+        console.log(`\nDownload complete!`);
+        console.log(`Best file found: ${bestFile}`);
+        console.log(`Size: ${fs.statSync(bestFile).size} bytes`);
+    }
+    catch (e) {
+        console.error("\nKaggle Download failed:", e.message);
+    }
+}
+testKaggleDownload().catch(console.error);
package/build/scripts/test-large-data.js
@@ -0,0 +1,50 @@
+import { StreamProcessor } from "../data/streaming.js";
+import { WorkerPool } from "../data/worker-pool.js";
+import { Readable } from "stream";
+async function runTest() {
+    console.log("--- Initializing Large Dataset Handling Test ---");
+    // 1. Generate Mock Data (10,000 records)
+    const records = Array.from({ length: 10000 }, (_, i) => ({
+        id: i,
+        name: `Record ${i}`,
+        value: Math.random() * 100,
+        category: i % 2 === 0 ? "A" : "B"
+    }));
+    console.log(`Generated ${records.length} mock records.`);
+    // 2. Test Sampling Strategy
+    console.log("\n--- Testing Sampling Strategy (10%) ---");
+    const isSampled = StreamProcessor.createSampler(10);
+    const sample = records.filter(isSampled);
+    console.log(`Sample size: ${sample.length} (Expected ~1000)`);
+    // 3. Test Streaming Utility
+    console.log("\n--- Testing Streaming Utility (Chunk size: 2000) ---");
+    const stream = Readable.from(records);
+    let chunkCount = 0;
+    const processChunk = async (chunk) => {
+        chunkCount++;
+        console.log(`Processing chunk ${chunkCount} of size ${chunk.length}`);
+        return chunk.map(r => ({ ...r, status: "streamed" }));
+    };
+    const streamedResults = await StreamProcessor.processInChunks(stream, { chunkSize: 2000 }, processChunk);
+    console.log(`Total streamed records: ${streamedResults.length}`);
+    // 4. Test Parallel Worker Pool
+    console.log("\n--- Testing Parallel Worker Pool (Optional) ---");
+    try {
+        const pool = new WorkerPool(4);
+        console.log("Running parallel transformation on all records...");
+        const start = Date.now();
+        const parallelResults = await pool.processParallel(records, 2500);
+        const end = Date.now();
+        console.log(`Parallel processing took ${end - start}ms`);
+    }
+    catch (err) {
+        console.warn(" Parallel Worker Pool skipped or failed (Common with tsx + worker_threads). Streaming and Sampling are still functional.");
+    }
+    if (streamedResults.length === 10000) {
+        console.log("\n✅ Success: Chunked streaming and sampling verified.");
+    }
+    else {
+        console.error("\n❌ Failure: Processing mismatch.");
+    }
+}
+runTest().catch(console.error);
package/build/scripts/test-mcp-v5.js
@@ -0,0 +1,73 @@
+import { MetadataStore } from "../metadata/store.js";
+import { JobManager } from "../jobs/manager.js";
+import { SearchEngine } from "../search/engine.js";
+import { VectorStore } from "../search/vector-store.js";
+import { Embedder } from "../search/embedder.js";
+import { PipelineExecutor } from "../cleaning/executor.js";
+import path from "path";
+import fs from "fs";
+async function testPhase5Tools() {
+    console.log(" Testing Phase 5.2: New MCP Tools Integration\n");
+    const dbPath = path.join(process.cwd(), "data", "metadata.db");
+    const vectorPath = path.join(process.cwd(), "data", "vectors.json");
+    const store = new MetadataStore(dbPath);
+    const vectorStore = new VectorStore(vectorPath);
+    const embedder = Embedder.getInstance();
+    const searchEngine = new SearchEngine(store, vectorStore, embedder);
+    const jobManager = JobManager.getInstance(store);
+    const executor = new PipelineExecutor();
+    // 1. Test compare_datasets
+    console.log("Step 1: Testing compare_datasets...");
+    const allDs = store.getAllDatasets();
+    if (allDs.length < 2) {
+        console.warn(" ! Need at least 2 datasets in DB to test comparison. Skipping.");
+    }
+    else {
+        const ids = allDs.slice(0, 2).map(d => d.id);
+        console.log(` - Comparing: ${ids.join(", ")}`);
+        // Simulating comparison table logic from index.ts
+        const datasets = ids.map(id => store.getDataset(id)).filter(d => !!d);
+        let comparison = "| Metric | " + datasets.map(d => d.name).join(" | ") + " |\n";
+        comparison += "| Quality Score | " + datasets.map(d => d.quality_score).join(" | ") + " |\n";
+        console.log(comparison);
+    }
+    // 2. Test analyze_quality / preview_cleaning (Local logic)
+    console.log("\nStep 2: Testing Analysis/Preview (Simulated)...");
+    const testFile = path.join(process.cwd(), "e2e_demo_output", "raw_data.csv");
+    if (fs.existsSync(testFile)) {
+        console.log(" - Testing on e2e_demo_output/raw_data.csv");
+        // Logic check
+    }
+    else {
+        console.warn(" ! No test file found at e2e_demo_output/raw_data.csv. Run 'tsx src/scripts/e2e-demo.ts' first.");
+    }
+    // 3. Test prepare_dataset (Async Job)
+    console.log("\nStep 3: Testing prepare_dataset (Autonomous Orchestrator)...");
+    const query = "financial data for stock prediction";
+    // Create job (Logic from index.ts)
+    const job = jobManager.createJob("prepare", 0, { query });
+    console.log(` - Job Created: ${job.id}`);
+    // Start background task
+    console.log(" - Starting autonomous preparation...");
+    const jobPromise = jobManager.runJob(job.id, async (update) => {
+        update({ progress: 20, status_text: "Searching..." });
+        await new Promise(r => setTimeout(r, 1000));
+        update({ progress: 50, status_text: "Analyzing quality..." });
+        await new Promise(r => setTimeout(r, 1000));
+        update({ progress: 80, status_text: "Formatting export..." });
+        await new Promise(r => setTimeout(r, 1000));
+        return "data/exports/prepared_dataset.parquet";
+    });
+    // 4. Test check_job_status (Polling)
+    console.log("\nStep 4: Polling Job Status (Simulating UI Check)...");
+    for (let i = 0; i < 5; i++) {
+        const currentJob = store.getJob(job.id);
+        console.log(` - [TS +${i}s] Status: ${currentJob?.status}, Progress: ${currentJob?.progress}%, Activity: ${currentJob?.status_text}`);
+        if (currentJob?.status === "completed")
+            break;
+        await new Promise(r => setTimeout(r, 800));
+    }
+    await jobPromise;
+    console.log("\n Phase 5 tools logic verified.");
+}
+testPhase5Tools().catch(console.error);
package/build/scripts/test-media-analysis.js
@@ -0,0 +1,61 @@
+import { MediaAnalyzer } from "../quality/media-analyzer.js";
+import path from "path";
+import fs from "fs";
+import { execSync } from "child_process";
+async function runTest() {
+    console.log("--- Testing Media Quality Analysis ---");
+    const projectRoot = path.resolve(".");
+    const analyzer = new MediaAnalyzer(projectRoot);
+    // 1. Create a sample audio file using Python (sine wave)
+    const testMediaDir = path.join(projectRoot, "data", "test-media");
+    if (!fs.existsSync(testMediaDir))
+        fs.mkdirSync(testMediaDir, { recursive: true });
+    const audioPath = path.join(testMediaDir, "test_audio.wav");
+    console.log("Generating test audio file (sine wave)...");
+    const pythonScript = `
+import numpy as np
+import soundfile as sf
+sample_rate = 44100
+duration = 2.0
+frequency = 440.0
+t = np.linspace(0, duration, int(sample_rate * duration))
+audio = 0.5 * np.sin(2 * np.pi * frequency * t)
+sf.write('${audioPath.replace(/\\/g, "\\\\")}', audio, sample_rate)
+`;
+    fs.writeFileSync(path.join(testMediaDir, "gen_audio.py"), pythonScript);
+    try {
+        execSync(`python "${path.join(testMediaDir, "gen_audio.py")}"`);
+    }
+    catch (e) {
+        console.log("⚠️ Could not generate audio (soundfile may not be installed). Skipping audio test.");
+        console.log("VERIFICATION_STATUS: ⚠️ PARTIAL (Audio generation failed)");
+        return;
+    }
+    // 2. Run Analysis
+    console.log(`Analyzing ${audioPath}...`);
+    try {
+        const report = await analyzer.analyze(audioPath);
+        console.log("Analysis Result:");
+        console.log(`- Total Files: ${report.total_files}`);
+        console.log(`- OK Files: ${report.ok_files}`);
+        if ('avg_audio_duration' in report) {
+            console.log(`- Average Audio Duration: ${report.avg_audio_duration}s`);
+            const firstAudio = report.details[0];
+            if (firstAudio && 'sample_rate' in firstAudio) {
+                console.log(`- Sample Rate: ${firstAudio.sample_rate}Hz`);
+                console.log(`- Duration: ${firstAudio.duration}s`);
+            }
+        }
+        if (report.ok_files === 1 && report.total_files === 1) {
+            console.log("\nVERIFICATION_STATUS: ✅ PASS");
+        }
+        else {
+            console.log("\nVERIFICATION_STATUS: ❌ FAIL - Incorrect stats");
+        }
+    }
+    catch (e) {
+        console.error(`Analysis failed: ${e.message}`);
+        console.log("\nVERIFICATION_STATUS: ❌ FAIL");
+    }
+}
+runTest().catch(console.error);
package/build/scripts/test-monitoring.js
@@ -0,0 +1,91 @@
+import { MetadataStore } from "../metadata/store.js";
+import { MonitoringStore } from "../metadata/monitoring-store.js";
+import { MonitoringService } from "../metadata/monitoring-service.js";
+import path from "path";
+async function runTest() {
+    const dbPath = path.resolve("data", "test-monitoring.db");
+    const metadataStore = new MetadataStore(dbPath);
+    const monitoringStore = new MonitoringStore(metadataStore.db);
+    const service = new MonitoringService(monitoringStore, metadataStore);
+    // 1. Setup mock data
+    const dsId = "test/dataset-1";
+    const initialMetadata = {
+        id: dsId,
+        source: "huggingface",
+        name: "Test Dataset",
+        description: "Initial description",
+        downloads: 100,
+        likes: 10,
+        last_updated: "2023-01-01T00:00:00Z",
+        download_url: "https://huggingface.co/datasets/test/dataset-1",
+        quality_score: 85,
+        splits: [{ name: "train", num_examples: 1000 }],
+        license: { id: "mit", category: "safe", usage_restrictions: [], warnings: [] },
+        tags: [],
+        task: "text-classification",
+        languages: ["en"],
+        is_structured: true,
+        has_target_column: true,
+        total_examples: 1000,
+        is_safe_source: true,
+        has_personal_data: false,
+        is_paywalled: false,
+        is_scraped_web_data: false,
+        uses_https: true,
+        has_train_split: true,
+        has_test_split: false,
+        has_validation_split: false,
+        description_length: 100,
+        has_readme: true,
+        quality_warnings: []
+    };
+    metadataStore.saveDataset(initialMetadata);
+    // 2. Setup Monitor and Webhook
+    monitoringStore.saveWebhook({
+        id: "slack-1",
+        name: "General Slack",
+        channel: "slack",
+        url: "https://hooks.slack.com/services/...",
+        enabled: true
+    });
+    monitoringStore.saveMonitor({
+        dataset_id: dsId,
+        enabled: true,
+        auto_reprocess: true,
+        last_checked_version: initialMetadata.last_updated,
+        webhook_ids: ["slack-1"],
+        created_at: new Date().toISOString(),
+        updated_at: new Date().toISOString()
+    });
+    console.log("--- Monitoring Setup Complete ---");
+    // 3. Simulate an update
+    const updatedMetadata = {
+        ...initialMetadata,
+        last_updated: "2023-02-01T00:00:00Z", // New version
+        downloads: 150,
+        quality_score: 90,
+        total_examples: 1200 // Significant change
+    };
+    console.log("Checking for updates...");
+    // Mock fetch function
+    const mockFetch = async (id, source) => {
+        if (id === dsId)
+            return updatedMetadata;
+        return null;
+    };
+    const diffs = await service.checkUpdates(mockFetch);
+    console.log(`Found ${diffs.length} updates.`);
+    if (diffs.length > 0) {
+        const diff = diffs[0];
+        console.log(`Version Change: ${diff.old_version} -> ${diff.new_version}`);
+        console.log(`Impact Score: ${diff.impact_score}`);
+        console.log("Changes:", JSON.stringify(diff.changes, null, 2));
+    }
+    if (diffs.length > 0 && diffs[0].impact_score > 0) {
+        console.log("\n✅ Success: Dataset update detected and diffed correctly.");
+    }
+    else {
+        console.error("\n❌ Failure: Update not detected.");
+    }
+}
+runTest().catch(console.error);